-rw-r--r--Documentation/DocBook/genericirq.tmpl84
-rw-r--r--Documentation/DocBook/kernel-locking.tmpl14
-rw-r--r--Documentation/RCU/checklist.txt46
-rw-r--r--Documentation/RCU/stallwarn.txt18
-rw-r--r--Documentation/RCU/trace.txt13
-rw-r--r--Documentation/cputopology.txt23
-rw-r--r--Documentation/feature-removal-schedule.txt28
-rw-r--r--Documentation/kernel-parameters.txt11
-rw-r--r--MAINTAINERS17
-rw-r--r--arch/arm/include/asm/hw_irq.h2
-rw-r--r--arch/arm/kernel/irq.c10
-rw-r--r--arch/arm/mach-bcmring/dma.c4
-rw-r--r--arch/arm/mach-bcmring/irq.c6
-rw-r--r--arch/arm/mach-iop13xx/msi.c8
-rw-r--r--arch/ia64/include/asm/hardirq.h11
-rw-r--r--arch/ia64/include/asm/system.h4
-rw-r--r--arch/ia64/kernel/msi_ia64.c8
-rw-r--r--arch/ia64/sn/kernel/msi_sn.c4
-rw-r--r--arch/m32r/kernel/irq.c2
-rw-r--r--arch/m32r/platforms/m32104ut/setup.c2
-rw-r--r--arch/m32r/platforms/m32700ut/setup.c8
-rw-r--r--arch/m32r/platforms/mappi/setup.c2
-rw-r--r--arch/m32r/platforms/mappi2/setup.c2
-rw-r--r--arch/m32r/platforms/mappi3/setup.c2
-rw-r--r--arch/m32r/platforms/oaks32r/setup.c2
-rw-r--r--arch/m32r/platforms/opsput/setup.c6
-rw-r--r--arch/m32r/platforms/usrv/setup.c4
-rw-r--r--arch/mips/kernel/mips-mt-fpaff.c2
-rw-r--r--arch/powerpc/include/asm/system.h4
-rw-r--r--arch/powerpc/platforms/cell/axon_msi.c6
-rw-r--r--arch/powerpc/platforms/pseries/xics.c2
-rw-r--r--arch/powerpc/sysdev/fsl_msi.c4
-rw-r--r--arch/powerpc/sysdev/mpic_pasemi_msi.c22
-rw-r--r--arch/powerpc/sysdev/mpic_u3msi.c18
-rw-r--r--arch/s390/Kconfig7
-rw-r--r--arch/s390/include/asm/hardirq.h4
-rw-r--r--arch/s390/include/asm/system.h1
-rw-r--r--arch/s390/include/asm/topology.h27
-rw-r--r--arch/s390/kernel/topology.c150
-rw-r--r--arch/sh/kernel/irq.c2
-rw-r--r--arch/sparc/kernel/pci_msi.c8
-rw-r--r--arch/tile/kernel/irq.c4
-rw-r--r--arch/um/kernel/irq.c6
-rw-r--r--arch/x86/Kconfig122
-rw-r--r--arch/x86/Kconfig.debug4
-rw-r--r--arch/x86/Makefile8
-rw-r--r--arch/x86/include/asm/amd_iommu.h2
-rw-r--r--arch/x86/include/asm/amd_iommu_proto.h2
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h23
-rw-r--r--arch/x86/include/asm/amd_nb.h (renamed from arch/x86/include/asm/k8.h)21
-rw-r--r--arch/x86/include/asm/apb_timer.h1
-rw-r--r--arch/x86/include/asm/apic.h4
-rw-r--r--arch/x86/include/asm/apicdef.h1
-rw-r--r--arch/x86/include/asm/cpu.h1
-rw-r--r--arch/x86/include/asm/cpufeature.h13
-rw-r--r--arch/x86/include/asm/dwarf2.h20
-rw-r--r--arch/x86/include/asm/fixmap.h15
-rw-r--r--arch/x86/include/asm/gart.h15
-rw-r--r--arch/x86/include/asm/hpet.h10
-rw-r--r--arch/x86/include/asm/hw_irq.h17
-rw-r--r--arch/x86/include/asm/i387.h185
-rw-r--r--arch/x86/include/asm/i8259.h2
-rw-r--r--arch/x86/include/asm/io.h1
-rw-r--r--arch/x86/include/asm/io_apic.h6
-rw-r--r--arch/x86/include/asm/irq_remapping.h35
-rw-r--r--arch/x86/include/asm/mrst.h10
-rw-r--r--arch/x86/include/asm/mwait.h15
-rw-r--r--arch/x86/include/asm/olpc_ofw.h4
-rw-r--r--arch/x86/include/asm/page_types.h2
-rw-r--r--arch/x86/include/asm/paravirt.h5
-rw-r--r--arch/x86/include/asm/paravirt_types.h1
-rw-r--r--arch/x86/include/asm/pgtable.h4
-rw-r--r--arch/x86/include/asm/pgtable_64.h2
-rw-r--r--arch/x86/include/asm/processor.h29
-rw-r--r--arch/x86/include/asm/setup.h5
-rw-r--r--arch/x86/include/asm/vmi.h269
-rw-r--r--arch/x86/include/asm/vmi_time.h98
-rw-r--r--arch/x86/kernel/Makefile6
-rw-r--r--arch/x86/kernel/acpi/cstate.c11
-rw-r--r--arch/x86/kernel/amd_iommu.c2
-rw-r--r--arch/x86/kernel/amd_iommu_init.c124
-rw-r--r--arch/x86/kernel/amd_nb.c (renamed from arch/x86/kernel/k8.c)56
-rw-r--r--arch/x86/kernel/apb_timer.c60
-rw-r--r--arch/x86/kernel/aperture_64.c20
-rw-r--r--arch/x86/kernel/apic/apic.c91
-rw-r--r--arch/x86/kernel/apic/io_apic.c882
-rw-r--r--arch/x86/kernel/apic/nmi.c2
-rw-r--r--arch/x86/kernel/apic/probe_64.c3
-rw-r--r--arch/x86/kernel/cpu/amd.c77
-rw-r--r--arch/x86/kernel/cpu/common.c24
-rw-r--r--arch/x86/kernel/cpu/cpu.h1
-rw-r--r--arch/x86/kernel/cpu/intel.c2
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c14
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c27
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c128
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c9
-rw-r--r--arch/x86/kernel/cpu/scattered.c6
-rw-r--r--arch/x86/kernel/crash_dump_64.c3
-rw-r--r--arch/x86/kernel/early-quirks.c2
-rw-r--r--arch/x86/kernel/early_printk.c13
-rw-r--r--arch/x86/kernel/early_printk_mrst.c319
-rw-r--r--arch/x86/kernel/entry_32.S310
-rw-r--r--arch/x86/kernel/entry_64.S108
-rw-r--r--arch/x86/kernel/hpet.c16
-rw-r--r--arch/x86/kernel/i387.c58
-rw-r--r--arch/x86/kernel/i8259.c63
-rw-r--r--arch/x86/kernel/irq.c24
-rw-r--r--arch/x86/kernel/irqinit.c17
-rw-r--r--arch/x86/kernel/machine_kexec_64.c4
-rw-r--r--arch/x86/kernel/olpc-xo1.c140
-rw-r--r--arch/x86/kernel/olpc.c89
-rw-r--r--arch/x86/kernel/olpc_ofw.c6
-rw-r--r--arch/x86/kernel/paravirt.c1
-rw-r--r--arch/x86/kernel/pci-gart_64.c31
-rw-r--r--arch/x86/kernel/pmtimer_64.c69
-rw-r--r--arch/x86/kernel/process_64.c2
-rw-r--r--arch/x86/kernel/reboot.c2
-rw-r--r--arch/x86/kernel/setup.c121
-rw-r--r--arch/x86/kernel/setup_percpu.c2
-rw-r--r--arch/x86/kernel/sfi.c4
-rw-r--r--arch/x86/kernel/smpboot.c118
-rw-r--r--arch/x86/kernel/sys_i386_32.c4
-rw-r--r--arch/x86/kernel/traps.c35
-rw-r--r--arch/x86/kernel/tsc.c66
-rw-r--r--arch/x86/kernel/uv_irq.c55
-rw-r--r--arch/x86/kernel/visws_quirks.c140
-rw-r--r--arch/x86/kernel/vmi_32.c893
-rw-r--r--arch/x86/kernel/vmiclock_32.c317
-rw-r--r--arch/x86/kvm/lapic.c3
-rw-r--r--arch/x86/kvm/x86.c7
-rw-r--r--arch/x86/lguest/boot.c18
-rw-r--r--arch/x86/lib/memcpy_32.c199
-rw-r--r--arch/x86/lib/memcpy_64.S158
-rw-r--r--arch/x86/lib/memmove_64.c189
-rw-r--r--arch/x86/mm/fault.c43
-rw-r--r--arch/x86/mm/init_32.c4
-rw-r--r--arch/x86/mm/init_64.c49
-rw-r--r--arch/x86/mm/k8topology_64.c8
-rw-r--r--arch/x86/mm/kmemcheck/opcode.c2
-rw-r--r--arch/x86/mm/numa_64.c2
-rw-r--r--arch/x86/mm/pgtable.c24
-rw-r--r--arch/x86/mm/tlb.c48
-rw-r--r--arch/x86/oprofile/op_model_amd.c145
-rw-r--r--arch/x86/pci/olpc.c2
-rw-r--r--arch/x86/xen/mmu.c1
-rw-r--r--arch/xtensa/kernel/irq.c2
-rw-r--r--drivers/acpi/acpi_pad.c7
-rw-r--r--drivers/base/topology.c16
-rw-r--r--drivers/block/Kconfig17
-rw-r--r--drivers/block/Makefile1
-rw-r--r--drivers/block/rbd.c1841
-rw-r--r--drivers/block/rbd_types.h73
-rw-r--r--drivers/block/virtio_blk.c17
-rw-r--r--drivers/char/agp/Kconfig2
-rw-r--r--drivers/char/agp/amd64-agp.c39
-rw-r--r--drivers/char/agp/generic.c4
-rw-r--r--drivers/char/tpm/tpm.c22
-rw-r--r--drivers/char/virtio_console.c240
-rw-r--r--drivers/edac/Kconfig16
-rw-r--r--drivers/edac/Makefile3
-rw-r--r--drivers/edac/amd64_edac.c17
-rw-r--r--drivers/edac/amd64_edac.h5
-rw-r--r--drivers/edac/amd64_edac_dbg.c207
-rw-r--r--drivers/edac/edac_device_sysfs.c16
-rw-r--r--drivers/edac/edac_mc_sysfs.c11
-rw-r--r--drivers/edac/edac_mce_amd.c452
-rw-r--r--drivers/edac/edac_module.c79
-rw-r--r--drivers/edac/edac_module.h1
-rw-r--r--drivers/edac/edac_pci_sysfs.c10
-rw-r--r--drivers/edac/edac_stub.c51
-rw-r--r--drivers/edac/mce_amd.c680
-rw-r--r--drivers/edac/mce_amd.h (renamed from drivers/edac/edac_mce_amd.h)59
-rw-r--r--drivers/edac/mce_amd_inj.c171
-rw-r--r--drivers/firmware/Kconfig2
-rw-r--r--drivers/idle/intel_idle.c9
-rw-r--r--drivers/input/evdev.c2
-rw-r--r--drivers/input/misc/hp_sdc_rtc.c4
-rw-r--r--drivers/input/serio/hil_mlc.c6
-rw-r--r--drivers/input/serio/hp_sdc.c4
-rw-r--r--drivers/isdn/act2000/act2000.h6
-rw-r--r--drivers/isdn/hisax/config.c18
-rw-r--r--drivers/isdn/hisax/hisax.h1
-rw-r--r--drivers/macintosh/adb.c2
-rw-r--r--drivers/mfd/twl4030-irq.c4
-rw-r--r--drivers/net/3c527.c2
-rw-r--r--drivers/net/hamradio/6pack.c2
-rw-r--r--drivers/net/hamradio/mkiss.c2
-rw-r--r--drivers/net/irda/sir_dev.c2
-rw-r--r--drivers/net/ppp_async.c2
-rw-r--r--drivers/net/wan/cosa.c2
-rw-r--r--drivers/parport/share.c2
-rw-r--r--drivers/pci/dmar.c8
-rw-r--r--drivers/pci/htirq.c22
-rw-r--r--drivers/pci/intr_remapping.c212
-rw-r--r--drivers/pci/msi.c38
-rw-r--r--drivers/vhost/net.c16
-rw-r--r--drivers/vhost/vhost.c22
-rw-r--r--drivers/vhost/vhost.h10
-rw-r--r--drivers/xen/events.c23
-rw-r--r--fs/affs/super.c4
-rw-r--r--fs/ceph/Kconfig14
-rw-r--r--fs/ceph/Makefile11
-rw-r--r--fs/ceph/README20
-rw-r--r--fs/ceph/addr.c65
-rw-r--r--fs/ceph/caps.c50
-rw-r--r--fs/ceph/ceph_frag.c3
-rw-r--r--fs/ceph/debugfs.c406
-rw-r--r--fs/ceph/dir.c97
-rw-r--r--fs/ceph/export.c5
-rw-r--r--fs/ceph/file.c207
-rw-r--r--fs/ceph/inode.c19
-rw-r--r--fs/ceph/ioctl.c77
-rw-r--r--fs/ceph/ioctl.h4
-rw-r--r--fs/ceph/locks.c23
-rw-r--r--fs/ceph/mds_client.c129
-rw-r--r--fs/ceph/mds_client.h20
-rw-r--r--fs/ceph/mdsmap.c11
-rw-r--r--fs/ceph/pagelist.c63
-rw-r--r--fs/ceph/snap.c10
-rw-r--r--fs/ceph/strings.c (renamed from fs/ceph/ceph_strings.c)82
-rw-r--r--fs/ceph/super.c1154
-rw-r--r--fs/ceph/super.h400
-rw-r--r--fs/ceph/xattr.c18
-rw-r--r--fs/gfs2/Kconfig2
-rw-r--r--fs/gfs2/aops.c24
-rw-r--r--fs/gfs2/bmap.c255
-rw-r--r--fs/gfs2/bmap.h20
-rw-r--r--fs/gfs2/dentry.c2
-rw-r--r--fs/gfs2/dir.c31
-rw-r--r--fs/gfs2/dir.h34
-rw-r--r--fs/gfs2/export.c9
-rw-r--r--fs/gfs2/file.c6
-rw-r--r--fs/gfs2/glock.c23
-rw-r--r--fs/gfs2/glock.h2
-rw-r--r--fs/gfs2/glops.c6
-rw-r--r--fs/gfs2/incore.h8
-rw-r--r--fs/gfs2/inode.c9
-rw-r--r--fs/gfs2/inode.h15
-rw-r--r--fs/gfs2/lock_dlm.c4
-rw-r--r--fs/gfs2/main.c6
-rw-r--r--fs/gfs2/ops_fstype.c79
-rw-r--r--fs/gfs2/ops_inode.c326
-rw-r--r--fs/gfs2/quota.c16
-rw-r--r--fs/gfs2/recovery.c15
-rw-r--r--fs/gfs2/rgrp.c50
-rw-r--r--fs/gfs2/rgrp.h8
-rw-r--r--fs/gfs2/super.c26
-rw-r--r--fs/gfs2/sys.c22
-rw-r--r--fs/gfs2/trace_gfs2.h3
-rw-r--r--fs/gfs2/trans.h9
-rw-r--r--fs/gfs2/xattr.c2
-rw-r--r--fs/hfs/bfind.c4
-rw-r--r--fs/hfs/btree.c2
-rw-r--r--fs/hfs/btree.h2
-rw-r--r--fs/hfsplus/bfind.c17
-rw-r--r--fs/hfsplus/bitmap.c20
-rw-r--r--fs/hfsplus/brec.c29
-rw-r--r--fs/hfsplus/btree.c67
-rw-r--r--fs/hfsplus/catalog.c50
-rw-r--r--fs/hfsplus/dir.c201
-rw-r--r--fs/hfsplus/extents.c223
-rw-r--r--fs/hfsplus/hfsplus_fs.h85
-rw-r--r--fs/hfsplus/hfsplus_raw.h3
-rw-r--r--fs/hfsplus/inode.c185
-rw-r--r--fs/hfsplus/ioctl.c153
-rw-r--r--fs/hfsplus/options.c10
-rw-r--r--fs/hfsplus/part_tbl.c5
-rw-r--r--fs/hfsplus/super.c310
-rw-r--r--fs/hfsplus/unicode.c16
-rw-r--r--fs/hfsplus/wrapper.c40
-rw-r--r--include/asm-generic/pgtable.h4
-rw-r--r--include/asm-generic/vmlinux.lds.h4
-rw-r--r--include/linux/acpi_pmtmr.h2
-rw-r--r--include/linux/ceph/auth.h (renamed from fs/ceph/auth.h)4
-rw-r--r--include/linux/ceph/buffer.h (renamed from fs/ceph/buffer.h)0
-rw-r--r--include/linux/ceph/ceph_debug.h (renamed from fs/ceph/ceph_debug.h)5
-rw-r--r--include/linux/ceph/ceph_frag.h (renamed from fs/ceph/ceph_frag.h)0
-rw-r--r--include/linux/ceph/ceph_fs.h (renamed from fs/ceph/ceph_fs.h)1
-rw-r--r--include/linux/ceph/ceph_hash.h (renamed from fs/ceph/ceph_hash.h)0
-rw-r--r--include/linux/ceph/debugfs.h33
-rw-r--r--include/linux/ceph/decode.h (renamed from fs/ceph/decode.h)5
-rw-r--r--include/linux/ceph/libceph.h249
-rw-r--r--include/linux/ceph/mdsmap.h (renamed from fs/ceph/mdsmap.h)0
-rw-r--r--include/linux/ceph/messenger.h (renamed from fs/ceph/messenger.h)12
-rw-r--r--include/linux/ceph/mon_client.h (renamed from fs/ceph/mon_client.h)1
-rw-r--r--include/linux/ceph/msgpool.h (renamed from fs/ceph/msgpool.h)0
-rw-r--r--include/linux/ceph/msgr.h (renamed from fs/ceph/msgr.h)0
-rw-r--r--include/linux/ceph/osd_client.h (renamed from fs/ceph/osd_client.h)67
-rw-r--r--include/linux/ceph/osdmap.h (renamed from fs/ceph/osdmap.h)4
-rw-r--r--include/linux/ceph/pagelist.h (renamed from fs/ceph/pagelist.h)23
-rw-r--r--include/linux/ceph/rados.h (renamed from fs/ceph/rados.h)0
-rw-r--r--include/linux/ceph/types.h (renamed from fs/ceph/types.h)0
-rw-r--r--include/linux/cgroup.h4
-rw-r--r--include/linux/compiler.h4
-rw-r--r--include/linux/cred.h2
-rw-r--r--include/linux/crush/crush.h (renamed from fs/ceph/crush/crush.h)0
-rw-r--r--include/linux/crush/hash.h (renamed from fs/ceph/crush/hash.h)0
-rw-r--r--include/linux/crush/mapper.h (renamed from fs/ceph/crush/mapper.h)0
-rw-r--r--include/linux/debug_locks.h5
-rw-r--r--include/linux/dmar.h10
-rw-r--r--include/linux/edac.h4
-rw-r--r--include/linux/fdtable.h6
-rw-r--r--include/linux/fs.h2
-rw-r--r--include/linux/genhd.h6
-rw-r--r--include/linux/hardirq.h11
-rw-r--r--include/linux/htirq.h5
-rw-r--r--include/linux/idr.h4
-rw-r--r--include/linux/init_task.h14
-rw-r--r--include/linux/input.h2
-rw-r--r--include/linux/interrupt.h3
-rw-r--r--include/linux/iocontext.h2
-rw-r--r--include/linux/irq.h447
-rw-r--r--include/linux/irqdesc.h159
-rw-r--r--include/linux/irqnr.h5
-rw-r--r--include/linux/kernel.h13
-rw-r--r--include/linux/key.h3
-rw-r--r--include/linux/kvm_host.h2
-rw-r--r--include/linux/lockdep.h21
-rw-r--r--include/linux/mm_types.h2
-rw-r--r--include/linux/msi.h13
-rw-r--r--include/linux/netfilter/nfnetlink_conntrack.h10
-rw-r--r--include/linux/netfilter/xt_SECMARK.h12
-rw-r--r--include/linux/nfs_fs.h2
-rw-r--r--include/linux/notifier.h10
-rw-r--r--include/linux/pci_ids.h1
-rw-r--r--include/linux/percpu-defs.h9
-rw-r--r--include/linux/radix-tree.h4
-rw-r--r--include/linux/rculist.h62
-rw-r--r--include/linux/rculist_nulls.h16
-rw-r--r--include/linux/rcupdate.h490
-rw-r--r--include/linux/rcutiny.h104
-rw-r--r--include/linux/rcutree.h57
-rw-r--r--include/linux/sched.h39
-rw-r--r--include/linux/security.h45
-rw-r--r--include/linux/selinux.h63
-rw-r--r--include/linux/srcu.h34
-rw-r--r--include/linux/sunrpc/auth_gss.h4
-rw-r--r--include/linux/thread_info.h4
-rw-r--r--include/linux/topology.h6
-rw-r--r--include/net/cls_cgroup.h3
-rw-r--r--include/net/netfilter/nf_conntrack.h2
-rw-r--r--include/trace/events/sched.h29
-rw-r--r--init/Kconfig28
-rw-r--r--init/main.c1
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/cgroup.c2
-rw-r--r--kernel/cpuset.c4
-rw-r--r--kernel/futex.c66
-rw-r--r--kernel/futex_compat.c2
-rw-r--r--kernel/hung_task.c4
-rw-r--r--kernel/irq/Kconfig53
-rw-r--r--kernel/irq/Makefile3
-rw-r--r--kernel/irq/autoprobe.c15
-rw-r--r--kernel/irq/chip.c378
-rw-r--r--kernel/irq/dummychip.c68
-rw-r--r--kernel/irq/handle.c341
-rw-r--r--kernel/irq/internals.h39
-rw-r--r--kernel/irq/irqdesc.c395
-rw-r--r--kernel/irq/manage.c87
-rw-r--r--kernel/irq/migration.c12
-rw-r--r--kernel/irq/numa_migrate.c120
-rw-r--r--kernel/irq/proc.c26
-rw-r--r--kernel/irq/resend.c5
-rw-r--r--kernel/irq/spurious.c8
-rw-r--r--kernel/lockdep.c51
-rw-r--r--kernel/pid.c3
-rw-r--r--kernel/printk.c4
-rw-r--r--kernel/rcupdate.c8
-rw-r--r--kernel/rcutiny.c33
-rw-r--r--kernel/rcutiny_plugin.h582
-rw-r--r--kernel/rcutorture.c17
-rw-r--r--kernel/rcutree.c92
-rw-r--r--kernel/rcutree.h20
-rw-r--r--kernel/rcutree_plugin.h47
-rw-r--r--kernel/rcutree_trace.c12
-rw-r--r--kernel/sched.c307
-rw-r--r--kernel/sched_fair.c81
-rw-r--r--kernel/sched_features.h5
-rw-r--r--kernel/sched_rt.c40
-rw-r--r--kernel/sched_stoptask.c108
-rw-r--r--kernel/softirq.c73
-rw-r--r--kernel/srcu.c2
-rw-r--r--kernel/stop_machine.c8
-rw-r--r--kernel/time/ntp.c14
-rw-r--r--kernel/trace/ftrace.c4
-rw-r--r--kernel/trace/trace.c2
-rw-r--r--kernel/trace/trace.h4
-rw-r--r--kernel/trace/trace_functions_graph.c86
-rw-r--r--kernel/trace/trace_irqsoff.c152
-rw-r--r--kernel/trace/trace_sched_wakeup.c256
-rw-r--r--kernel/watchdog.c2
-rw-r--r--lib/Kconfig.debug41
-rw-r--r--lib/radix-tree.c2
-rw-r--r--lib/swiotlb.c18
-rw-r--r--mm/memory.c2
-rw-r--r--mm/vmalloc.c9
-rw-r--r--net/Kconfig1
-rw-r--r--net/Makefile1
-rw-r--r--net/ceph/Kconfig28
-rw-r--r--net/ceph/Makefile37
-rw-r--r--net/ceph/armor.c (renamed from fs/ceph/armor.c)0
-rw-r--r--net/ceph/auth.c (renamed from fs/ceph/auth.c)10
-rw-r--r--net/ceph/auth_none.c (renamed from fs/ceph/auth_none.c)7
-rw-r--r--net/ceph/auth_none.h (renamed from fs/ceph/auth_none.h)3
-rw-r--r--net/ceph/auth_x.c (renamed from fs/ceph/auth_x.c)9
-rw-r--r--net/ceph/auth_x.h (renamed from fs/ceph/auth_x.h)3
-rw-r--r--net/ceph/auth_x_protocol.h (renamed from fs/ceph/auth_x_protocol.h)0
-rw-r--r--net/ceph/buffer.c (renamed from fs/ceph/buffer.c)9
-rw-r--r--net/ceph/ceph_common.c529
-rw-r--r--net/ceph/ceph_fs.c (renamed from fs/ceph/ceph_fs.c)5
-rw-r--r--net/ceph/ceph_hash.c (renamed from fs/ceph/ceph_hash.c)2
-rw-r--r--net/ceph/ceph_strings.c84
-rw-r--r--net/ceph/crush/crush.c (renamed from fs/ceph/crush/crush.c)2
-rw-r--r--net/ceph/crush/hash.c (renamed from fs/ceph/crush/hash.c)2
-rw-r--r--net/ceph/crush/mapper.c (renamed from fs/ceph/crush/mapper.c)4
-rw-r--r--net/ceph/crypto.c (renamed from fs/ceph/crypto.c)4
-rw-r--r--net/ceph/crypto.h (renamed from fs/ceph/crypto.h)4
-rw-r--r--net/ceph/debugfs.c267
-rw-r--r--net/ceph/messenger.c (renamed from fs/ceph/messenger.c)296
-rw-r--r--net/ceph/mon_client.c (renamed from fs/ceph/mon_client.c)73
-rw-r--r--net/ceph/msgpool.c (renamed from fs/ceph/msgpool.c)4
-rw-r--r--net/ceph/osd_client.c (renamed from fs/ceph/osd_client.c)400
-rw-r--r--net/ceph/osdmap.c (renamed from fs/ceph/osdmap.c)30
-rw-r--r--net/ceph/pagelist.c154
-rw-r--r--net/ceph/pagevec.c223
-rw-r--r--net/core/sock.c5
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c28
-rw-r--r--net/ipv4/netfilter/nf_nat_core.c2
-rw-r--r--net/netfilter/core.c2
-rw-r--r--net/netfilter/nf_conntrack_ecache.c4
-rw-r--r--net/netfilter/nf_conntrack_extend.c2
-rw-r--r--net/netfilter/nf_conntrack_netlink.c44
-rw-r--r--net/netfilter/nf_conntrack_proto.c4
-rw-r--r--net/netfilter/nf_conntrack_standalone.c28
-rw-r--r--net/netfilter/nf_log.c2
-rw-r--r--net/netfilter/nf_queue.c2
-rw-r--r--net/netfilter/xt_CT.c1
-rw-r--r--net/netfilter/xt_SECMARK.c35
-rw-r--r--net/sched/cls_cgroup.c2
-rw-r--r--security/apparmor/.gitignore1
-rw-r--r--security/apparmor/apparmorfs.c4
-rw-r--r--security/capability.c17
-rw-r--r--security/commoncap.c5
-rw-r--r--security/security.c35
-rw-r--r--security/selinux/Makefile21
-rw-r--r--security/selinux/exports.c49
-rw-r--r--security/selinux/hooks.c28
-rw-r--r--security/selinux/include/classmap.h2
-rw-r--r--security/selinux/include/security.h23
-rw-r--r--security/selinux/selinuxfs.c195
-rw-r--r--security/selinux/ss/Makefile9
-rw-r--r--security/selinux/ss/avtab.c46
-rw-r--r--security/selinux/ss/avtab.h3
-rw-r--r--security/selinux/ss/conditional.c123
-rw-r--r--security/selinux/ss/conditional.h2
-rw-r--r--security/selinux/ss/ebitmap.c81
-rw-r--r--security/selinux/ss/ebitmap.h1
-rw-r--r--security/selinux/ss/policydb.c861
-rw-r--r--security/selinux/ss/policydb.h20
-rw-r--r--security/selinux/ss/services.c63
-rw-r--r--security/selinux/ss/status.c126
-rw-r--r--security/smack/smack_lsm.c8
-rw-r--r--security/tomoyo/common.c17
465 files changed, 16810 insertions, 10045 deletions
diff --git a/Documentation/DocBook/genericirq.tmpl b/Documentation/DocBook/genericirq.tmpl
index 1448b33fd222..fb10fd08c05c 100644
--- a/Documentation/DocBook/genericirq.tmpl
+++ b/Documentation/DocBook/genericirq.tmpl
@@ -28,7 +28,7 @@
28 </authorgroup> 28 </authorgroup>
29 29
30 <copyright> 30 <copyright>
31 <year>2005-2006</year> 31 <year>2005-2010</year>
32 <holder>Thomas Gleixner</holder> 32 <holder>Thomas Gleixner</holder>
33 </copyright> 33 </copyright>
34 <copyright> 34 <copyright>
@@ -100,6 +100,10 @@
100 <listitem><para>Edge type</para></listitem> 100 <listitem><para>Edge type</para></listitem>
101 <listitem><para>Simple type</para></listitem> 101 <listitem><para>Simple type</para></listitem>
102 </itemizedlist> 102 </itemizedlist>
103 During the implementation we identified another type:
104 <itemizedlist>
105 <listitem><para>Fast EOI type</para></listitem>
106 </itemizedlist>
103 In the SMP world of the __do_IRQ() super-handler another type 107 In the SMP world of the __do_IRQ() super-handler another type
104 was identified: 108 was identified:
105 <itemizedlist> 109 <itemizedlist>
@@ -153,6 +157,7 @@
153 is still available. This leads to a kind of duality for the time 157 is still available. This leads to a kind of duality for the time
154 being. Over time the new model should be used in more and more 158 being. Over time the new model should be used in more and more
155 architectures, as it enables smaller and cleaner IRQ subsystems. 159 architectures, as it enables smaller and cleaner IRQ subsystems.
160 It's deprecated for three years now and about to be removed.
156 </para> 161 </para>
157 </chapter> 162 </chapter>
158 <chapter id="bugs"> 163 <chapter id="bugs">
@@ -217,6 +222,7 @@
217 <itemizedlist> 222 <itemizedlist>
218 <listitem><para>handle_level_irq</para></listitem> 223 <listitem><para>handle_level_irq</para></listitem>
219 <listitem><para>handle_edge_irq</para></listitem> 224 <listitem><para>handle_edge_irq</para></listitem>
225 <listitem><para>handle_fasteoi_irq</para></listitem>
220 <listitem><para>handle_simple_irq</para></listitem> 226 <listitem><para>handle_simple_irq</para></listitem>
221 <listitem><para>handle_percpu_irq</para></listitem> 227 <listitem><para>handle_percpu_irq</para></listitem>
222 </itemizedlist> 228 </itemizedlist>
@@ -233,33 +239,33 @@
233 are used by the default flow implementations. 239 are used by the default flow implementations.
234 The following helper functions are implemented (simplified excerpt): 240 The following helper functions are implemented (simplified excerpt):
235 <programlisting> 241 <programlisting>
236default_enable(irq) 242default_enable(struct irq_data *data)
237{ 243{
238 desc->chip->unmask(irq); 244 desc->chip->irq_unmask(data);
239} 245}
240 246
241default_disable(irq) 247default_disable(struct irq_data *data)
242{ 248{
243 if (!delay_disable(irq)) 249 if (!delay_disable(data))
244 desc->chip->mask(irq); 250 desc->chip->irq_mask(data);
245} 251}
246 252
247default_ack(irq) 253default_ack(struct irq_data *data)
248{ 254{
249 chip->ack(irq); 255 chip->irq_ack(data);
250} 256}
251 257
252default_mask_ack(irq) 258default_mask_ack(struct irq_data *data)
253{ 259{
254 if (chip->mask_ack) { 260 if (chip->irq_mask_ack) {
255 chip->mask_ack(irq); 261 chip->irq_mask_ack(data);
256 } else { 262 } else {
257 chip->mask(irq); 263 chip->irq_mask(data);
258 chip->ack(irq); 264 chip->irq_ack(data);
259 } 265 }
260} 266}
261 267
262noop(irq) 268noop(struct irq_data *data)
263{ 269{
264} 270}
265 271
@@ -278,12 +284,27 @@ noop(irq)
278 <para> 284 <para>
279 The following control flow is implemented (simplified excerpt): 285 The following control flow is implemented (simplified excerpt):
280 <programlisting> 286 <programlisting>
281desc->chip->start(); 287desc->chip->irq_mask();
282handle_IRQ_event(desc->action); 288handle_IRQ_event(desc->action);
283desc->chip->end(); 289desc->chip->irq_unmask();
284 </programlisting> 290 </programlisting>
285 </para> 291 </para>
286 </sect3> 292 </sect3>
293 <sect3 id="Default_FASTEOI_IRQ_flow_handler">
294 <title>Default Fast EOI IRQ flow handler</title>
295 <para>
296 handle_fasteoi_irq provides a generic implementation
297 for interrupts, which only need an EOI at the end of
298 the handler
299 </para>
300 <para>
301 The following control flow is implemented (simplified excerpt):
302 <programlisting>
303handle_IRQ_event(desc->action);
304desc->chip->irq_eoi();
305 </programlisting>
306 </para>
307 </sect3>
287 <sect3 id="Default_Edge_IRQ_flow_handler"> 308 <sect3 id="Default_Edge_IRQ_flow_handler">
288 <title>Default Edge IRQ flow handler</title> 309 <title>Default Edge IRQ flow handler</title>
289 <para> 310 <para>
@@ -294,20 +315,19 @@ desc->chip->end();
294 The following control flow is implemented (simplified excerpt): 315 The following control flow is implemented (simplified excerpt):
295 <programlisting> 316 <programlisting>
296if (desc->status &amp; running) { 317if (desc->status &amp; running) {
297 desc->chip->hold(); 318 desc->chip->irq_mask();
298 desc->status |= pending | masked; 319 desc->status |= pending | masked;
299 return; 320 return;
300} 321}
301desc->chip->start(); 322desc->chip->irq_ack();
302desc->status |= running; 323desc->status |= running;
303do { 324do {
304 if (desc->status &amp; masked) 325 if (desc->status &amp; masked)
305 desc->chip->enable(); 326 desc->chip->irq_unmask();
306 desc->status &amp;= ~pending; 327 desc->status &amp;= ~pending;
307 handle_IRQ_event(desc->action); 328 handle_IRQ_event(desc->action);
308} while (status &amp; pending); 329} while (status &amp; pending);
309desc->status &amp;= ~running; 330desc->status &amp;= ~running;
310desc->chip->end();
311 </programlisting> 331 </programlisting>
312 </para> 332 </para>
313 </sect3> 333 </sect3>
@@ -342,9 +362,9 @@ handle_IRQ_event(desc->action);
342 <para> 362 <para>
343 The following control flow is implemented (simplified excerpt): 363 The following control flow is implemented (simplified excerpt):
344 <programlisting> 364 <programlisting>
345desc->chip->start();
346handle_IRQ_event(desc->action); 365handle_IRQ_event(desc->action);
347desc->chip->end(); 366if (desc->chip->irq_eoi)
367 desc->chip->irq_eoi();
348 </programlisting> 368 </programlisting>
349 </para> 369 </para>
350 </sect3> 370 </sect3>
@@ -375,8 +395,7 @@ desc->chip->end();
375 mechanism. (It's necessary to enable CONFIG_HARDIRQS_SW_RESEND when 395 mechanism. (It's necessary to enable CONFIG_HARDIRQS_SW_RESEND when
376 you want to use the delayed interrupt disable feature and your 396 you want to use the delayed interrupt disable feature and your
377 hardware is not capable of retriggering an interrupt.) 397 hardware is not capable of retriggering an interrupt.)
378 The delayed interrupt disable can be runtime enabled, per interrupt, 398 The delayed interrupt disable is not configurable.
379 by setting the IRQ_DELAYED_DISABLE flag in the irq_desc status field.
380 </para> 399 </para>
381 </sect2> 400 </sect2>
382 </sect1> 401 </sect1>
@@ -387,13 +406,13 @@ desc->chip->end();
387 contains all the direct chip relevant functions, which 406 contains all the direct chip relevant functions, which
388 can be utilized by the irq flow implementations. 407 can be utilized by the irq flow implementations.
389 <itemizedlist> 408 <itemizedlist>
390 <listitem><para>ack()</para></listitem> 409 <listitem><para>irq_ack()</para></listitem>
391 <listitem><para>mask_ack() - Optional, recommended for performance</para></listitem> 410 <listitem><para>irq_mask_ack() - Optional, recommended for performance</para></listitem>
392 <listitem><para>mask()</para></listitem> 411 <listitem><para>irq_mask()</para></listitem>
393 <listitem><para>unmask()</para></listitem> 412 <listitem><para>irq_unmask()</para></listitem>
394 <listitem><para>retrigger() - Optional</para></listitem> 413 <listitem><para>irq_retrigger() - Optional</para></listitem>
395 <listitem><para>set_type() - Optional</para></listitem> 414 <listitem><para>irq_set_type() - Optional</para></listitem>
396 <listitem><para>set_wake() - Optional</para></listitem> 415 <listitem><para>irq_set_wake() - Optional</para></listitem>
397 </itemizedlist> 416 </itemizedlist>
398 These primitives are strictly intended to mean what they say: ack means 417 These primitives are strictly intended to mean what they say: ack means
399 ACK, masking means masking of an IRQ line, etc. It is up to the flow 418 ACK, masking means masking of an IRQ line, etc. It is up to the flow
@@ -458,6 +477,7 @@ desc->chip->end();
458 <para> 477 <para>
459 This chapter contains the autogenerated documentation of the internal functions. 478 This chapter contains the autogenerated documentation of the internal functions.
460 </para> 479 </para>
480!Ikernel/irq/irqdesc.c
461!Ikernel/irq/handle.c 481!Ikernel/irq/handle.c
462!Ikernel/irq/chip.c 482!Ikernel/irq/chip.c
463 </chapter> 483 </chapter>
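
The genericirq changes above rename the irq_chip callbacks to the irq_* form taking a struct irq_data * and add the fast-EOI flow handler. The following is a minimal sketch of a board driver written against that interface; it assumes <linux/irq.h>, and "demo", the my_hw_*() register helpers and DEMO_IRQ are made-up placeholders, not part of this patch:

    /* assumes <linux/irq.h>; my_hw_*() and DEMO_IRQ are hypothetical */
    static void demo_irq_mask(struct irq_data *data)
    {
            my_hw_mask(data->irq);          /* hypothetical register write */
    }

    static void demo_irq_unmask(struct irq_data *data)
    {
            my_hw_unmask(data->irq);
    }

    static void demo_irq_eoi(struct irq_data *data)
    {
            my_hw_eoi(data->irq);           /* single EOI at end of handler */
    }

    static struct irq_chip demo_chip = {
            .name       = "DEMO",
            .irq_mask   = demo_irq_mask,
            .irq_unmask = demo_irq_unmask,
            .irq_eoi    = demo_irq_eoi,
    };

    static void __init demo_init_irq(void)
    {
            /* fast-EOI flow: run the handler, then call chip->irq_eoi() once */
            set_irq_chip_and_handler(DEMO_IRQ, &demo_chip, handle_fasteoi_irq);
    }
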
diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl
index a0d479d1e1dd..f66f4df18690 100644
--- a/Documentation/DocBook/kernel-locking.tmpl
+++ b/Documentation/DocBook/kernel-locking.tmpl
@@ -1645,7 +1645,9 @@ the amount of locking which needs to be done.
1645 all the readers who were traversing the list when we deleted the 1645 all the readers who were traversing the list when we deleted the
1646 element are finished. We use <function>call_rcu()</function> to 1646 element are finished. We use <function>call_rcu()</function> to
1647 register a callback which will actually destroy the object once 1647 register a callback which will actually destroy the object once
1648 the readers are finished. 1648 all pre-existing readers are finished. Alternatively,
1649 <function>synchronize_rcu()</function> may be used to block until
 1650 all pre-existing readers are finished.
1649 </para> 1651 </para>
1650 <para> 1652 <para>
1651 But how does Read Copy Update know when the readers are 1653 But how does Read Copy Update know when the readers are
@@ -1714,7 +1716,7 @@ the amount of locking which needs to be done.
1714- object_put(obj); 1716- object_put(obj);
1715+ list_del_rcu(&amp;obj-&gt;list); 1717+ list_del_rcu(&amp;obj-&gt;list);
1716 cache_num--; 1718 cache_num--;
1717+ call_rcu(&amp;obj-&gt;rcu, cache_delete_rcu, obj); 1719+ call_rcu(&amp;obj-&gt;rcu, cache_delete_rcu);
1718 } 1720 }
1719 1721
1720 /* Must be holding cache_lock */ 1722 /* Must be holding cache_lock */
@@ -1725,14 +1727,6 @@ the amount of locking which needs to be done.
1725 if (++cache_num > MAX_CACHE_SIZE) { 1727 if (++cache_num > MAX_CACHE_SIZE) {
1726 struct object *i, *outcast = NULL; 1728 struct object *i, *outcast = NULL;
1727 list_for_each_entry(i, &amp;cache, list) { 1729 list_for_each_entry(i, &amp;cache, list) {
1728@@ -85,6 +94,7 @@
1729 obj-&gt;popularity = 0;
1730 atomic_set(&amp;obj-&gt;refcnt, 1); /* The cache holds a reference */
1731 spin_lock_init(&amp;obj-&gt;lock);
1732+ INIT_RCU_HEAD(&amp;obj-&gt;rcu);
1733
1734 spin_lock_irqsave(&amp;cache_lock, flags);
1735 __cache_add(obj);
1736@@ -104,12 +114,11 @@ 1730@@ -104,12 +114,11 @@
1737 struct object *cache_find(int id) 1731 struct object *cache_find(int id)
1738 { 1732 {
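
The kernel-locking change above describes the two update-side RCU patterns: deferring destruction with call_rcu(), or blocking with synchronize_rcu() until all pre-existing readers are done. A minimal sketch of both, assuming a simple structure with an embedded struct rcu_head (the names are illustrative, not taken from the document's cache example):

    #include <linux/rculist.h>
    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct object {
            struct list_head list;
            struct rcu_head rcu;
            int id;
    };

    static void object_free_rcu(struct rcu_head *head)
    {
            /* runs only after all pre-existing readers have finished */
            kfree(container_of(head, struct object, rcu));
    }

    static void object_del_async(struct object *obj)   /* never blocks */
    {
            list_del_rcu(&obj->list);
            call_rcu(&obj->rcu, object_free_rcu);
    }

    static void object_del_sync(struct object *obj)    /* may block */
    {
            list_del_rcu(&obj->list);
            synchronize_rcu();
            kfree(obj);
    }
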
diff --git a/Documentation/RCU/checklist.txt b/Documentation/RCU/checklist.txt
index 790d1a812376..0c134f8afc6f 100644
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -218,13 +218,22 @@ over a rather long period of time, but improvements are always welcome!
218 include: 218 include:
219 219
220 a. Keeping a count of the number of data-structure elements 220 a. Keeping a count of the number of data-structure elements
221 used by the RCU-protected data structure, including those 221 used by the RCU-protected data structure, including
222 waiting for a grace period to elapse. Enforce a limit 222 those waiting for a grace period to elapse. Enforce a
223 on this number, stalling updates as needed to allow 223 limit on this number, stalling updates as needed to allow
224 previously deferred frees to complete. 224 previously deferred frees to complete. Alternatively,
225 225 limit only the number awaiting deferred free rather than
226 Alternatively, limit only the number awaiting deferred 226 the total number of elements.
227 free rather than the total number of elements. 227
228 One way to stall the updates is to acquire the update-side
229 mutex. (Don't try this with a spinlock -- other CPUs
230 spinning on the lock could prevent the grace period
231 from ever ending.) Another way to stall the updates
232 is for the updates to use a wrapper function around
233 the memory allocator, so that this wrapper function
234 simulates OOM when there is too much memory awaiting an
235 RCU grace period. There are of course many other
236 variations on this theme.
228 237
229 b. Limiting update rate. For example, if updates occur only 238 b. Limiting update rate. For example, if updates occur only
230 once per hour, then no explicit rate limiting is required, 239 once per hour, then no explicit rate limiting is required,
@@ -365,3 +374,26 @@ over a rather long period of time, but improvements are always welcome!
365 and the compiler to freely reorder code into and out of RCU 374 and the compiler to freely reorder code into and out of RCU
366 read-side critical sections. It is the responsibility of the 375 read-side critical sections. It is the responsibility of the
367 RCU update-side primitives to deal with this. 376 RCU update-side primitives to deal with this.
377
37817. Use CONFIG_PROVE_RCU, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and
379 the __rcu sparse checks to validate your RCU code. These
380 can help find problems as follows:
381
382 CONFIG_PROVE_RCU: check that accesses to RCU-protected data
383 structures are carried out under the proper RCU
384 read-side critical section, while holding the right
385 combination of locks, or whatever other conditions
386 are appropriate.
387
388 CONFIG_DEBUG_OBJECTS_RCU_HEAD: check that you don't pass the
389 same object to call_rcu() (or friends) before an RCU
390 grace period has elapsed since the last time that you
391 passed that same object to call_rcu() (or friends).
392
393 __rcu sparse checks: tag the pointer to the RCU-protected data
394 structure with __rcu, and sparse will warn you if you
395 access that pointer without the services of one of the
396 variants of rcu_dereference().
397
398 These debugging aids can help you find problems that are
399 otherwise extremely difficult to spot.
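
Checklist item 17 above refers to the __rcu sparse annotation and the rcu_dereference() family. A minimal sketch of an annotated pointer and the matching reader/updater, assuming a hypothetical "active_config" global (not from this patch):

    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct config;
    static struct config __rcu *active_config;   /* sparse-checked pointer */

    static struct config *get_active_config(void)
    {
            /* caller must hold rcu_read_lock()/rcu_read_unlock() */
            return rcu_dereference(active_config);
    }

    static void set_active_config(struct config *newc, struct config *oldc)
    {
            rcu_assign_pointer(active_config, newc);
            synchronize_rcu();                   /* wait out readers of oldc */
            kfree(oldc);
    }
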
diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index 44c6dcc93d6d..862c08ef1fde 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -80,6 +80,24 @@ o A CPU looping with bottom halves disabled. This condition can
80o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel 80o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel
81 without invoking schedule(). 81 without invoking schedule().
82 82
83o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might
84 happen to preempt a low-priority task in the middle of an RCU
85 read-side critical section. This is especially damaging if
86 that low-priority task is not permitted to run on any other CPU,
87 in which case the next RCU grace period can never complete, which
88 will eventually cause the system to run out of memory and hang.
89 While the system is in the process of running itself out of
90 memory, you might see stall-warning messages.
91
92o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that
93 is running at a higher priority than the RCU softirq threads.
94 This will prevent RCU callbacks from ever being invoked,
95 and in a CONFIG_TREE_PREEMPT_RCU kernel will further prevent
96 RCU grace periods from ever completing. Either way, the
97 system will eventually run out of memory and hang. In the
98 CONFIG_TREE_PREEMPT_RCU case, you might see stall-warning
99 messages.
100
83o A bug in the RCU implementation. 101o A bug in the RCU implementation.
84 102
85o A hardware failure. This is quite unlikely, but has occurred 103o A hardware failure. This is quite unlikely, but has occurred
diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index efd8cc95c06b..a851118775d8 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -125,6 +125,17 @@ o "b" is the batch limit for this CPU. If more than this number
125 of RCU callbacks is ready to invoke, then the remainder will 125 of RCU callbacks is ready to invoke, then the remainder will
126 be deferred. 126 be deferred.
127 127
128o "ci" is the number of RCU callbacks that have been invoked for
129 this CPU. Note that ci+ql is the number of callbacks that have
130 been registered in absence of CPU-hotplug activity.
131
132o "co" is the number of RCU callbacks that have been orphaned due to
133 this CPU going offline.
134
135o "ca" is the number of RCU callbacks that have been adopted due to
136 other CPUs going offline. Note that ci+co-ca+ql is the number of
137 RCU callbacks registered on this CPU.
138
128There is also an rcu/rcudata.csv file with the same information in 139There is also an rcu/rcudata.csv file with the same information in
129comma-separated-variable spreadsheet format. 140comma-separated-variable spreadsheet format.
130 141
@@ -180,7 +191,7 @@ o "s" is the "signaled" state that drives force_quiescent_state()'s
180 191
181o "jfq" is the number of jiffies remaining for this grace period 192o "jfq" is the number of jiffies remaining for this grace period
182 before force_quiescent_state() is invoked to help push things 193 before force_quiescent_state() is invoked to help push things
183 along. Note that CPUs in dyntick-idle mode thoughout the grace 194 along. Note that CPUs in dyntick-idle mode throughout the grace
 184 period will not report on their own, but rather must be checked by 195 period will not report on their own, but rather must be checked by
185 some other CPU via force_quiescent_state(). 196 some other CPU via force_quiescent_state().
186 197
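
As a worked example of the callback bookkeeping added above (numbers invented for illustration): with ci=1000 callbacks invoked, co=10 orphaned, ca=4 adopted and ql=14 still queued, the number of RCU callbacks registered on this CPU is ci+co-ca+ql = 1000+10-4+14 = 1020.
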
diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt
index f1c5c4bccd3e..902d3151f527 100644
--- a/Documentation/cputopology.txt
+++ b/Documentation/cputopology.txt
@@ -14,25 +14,39 @@ to /proc/cpuinfo.
14 identifier (rather than the kernel's). The actual value is 14 identifier (rather than the kernel's). The actual value is
15 architecture and platform dependent. 15 architecture and platform dependent.
16 16
173) /sys/devices/system/cpu/cpuX/topology/thread_siblings: 173) /sys/devices/system/cpu/cpuX/topology/book_id:
18
19 the book ID of cpuX. Typically it is the hardware platform's
20 identifier (rather than the kernel's). The actual value is
21 architecture and platform dependent.
22
234) /sys/devices/system/cpu/cpuX/topology/thread_siblings:
18 24
 19 internal kernel map of cpuX's hardware threads within the same 25 internal kernel map of cpuX's hardware threads within the same
20 core as cpuX 26 core as cpuX
21 27
224) /sys/devices/system/cpu/cpuX/topology/core_siblings: 285) /sys/devices/system/cpu/cpuX/topology/core_siblings:
23 29
24 internal kernel map of cpuX's hardware threads within the same 30 internal kernel map of cpuX's hardware threads within the same
25 physical_package_id. 31 physical_package_id.
26 32
336) /sys/devices/system/cpu/cpuX/topology/book_siblings:
34
35 internal kernel map of cpuX's hardware threads within the same
36 book_id.
37
27To implement it in an architecture-neutral way, a new source file, 38To implement it in an architecture-neutral way, a new source file,
28drivers/base/topology.c, is to export the 4 attributes. 39drivers/base/topology.c, is to export the 4 or 6 attributes. The two book
40related sysfs files will only be created if CONFIG_SCHED_BOOK is selected.
29 41
30For an architecture to support this feature, it must define some of 42For an architecture to support this feature, it must define some of
31these macros in include/asm-XXX/topology.h: 43these macros in include/asm-XXX/topology.h:
32#define topology_physical_package_id(cpu) 44#define topology_physical_package_id(cpu)
33#define topology_core_id(cpu) 45#define topology_core_id(cpu)
46#define topology_book_id(cpu)
34#define topology_thread_cpumask(cpu) 47#define topology_thread_cpumask(cpu)
35#define topology_core_cpumask(cpu) 48#define topology_core_cpumask(cpu)
49#define topology_book_cpumask(cpu)
36 50
37The type of **_id is int. 51The type of **_id is int.
38The type of siblings is (const) struct cpumask *. 52The type of siblings is (const) struct cpumask *.
@@ -45,6 +59,9 @@ not defined by include/asm-XXX/topology.h:
453) thread_siblings: just the given CPU 593) thread_siblings: just the given CPU
464) core_siblings: just the given CPU 604) core_siblings: just the given CPU
47 61
62For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
63default definitions for topology_book_id() and topology_book_cpumask().
64
48Additionally, CPU topology information is provided under 65Additionally, CPU topology information is provided under
49/sys/devices/system/cpu and includes these files. The internal 66/sys/devices/system/cpu and includes these files. The internal
50source for the output is in brackets ("[]"). 67source for the output is in brackets ("[]").
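
To illustrate the book-aware macro list documented above, here is a sketch of what an architecture's include/asm-XXX/topology.h might provide when CONFIG_SCHED_BOOK is selected. The macro names follow the documentation; the cpu_topology structure and its field names are an assumption for illustration, not the s390 implementation:

    /* illustrative include/asm-XXX/topology.h fragment, CONFIG_SCHED_BOOK=y */
    struct cpu_topology {
            int core_id;
            int socket_id;
            int book_id;
            cpumask_t thread_mask;
            cpumask_t core_mask;
            cpumask_t book_mask;
    };

    extern struct cpu_topology cpu_topology[NR_CPUS];

    #define topology_physical_package_id(cpu) (cpu_topology[cpu].socket_id)
    #define topology_core_id(cpu)             (cpu_topology[cpu].core_id)
    #define topology_book_id(cpu)             (cpu_topology[cpu].book_id)
    #define topology_thread_cpumask(cpu)      (&cpu_topology[cpu].thread_mask)
    #define topology_core_cpumask(cpu)        (&cpu_topology[cpu].core_mask)
    #define topology_book_cpumask(cpu)        (&cpu_topology[cpu].book_mask)
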
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 842aa9de84a6..5e2bc4ab897a 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -386,34 +386,6 @@ Who: Tejun Heo <tj@kernel.org>
386 386
387---------------------------- 387----------------------------
388 388
389What: Support for VMware's guest paravirtuliazation technique [VMI] will be
390 dropped.
391When: 2.6.37 or earlier.
392Why: With the recent innovations in CPU hardware acceleration technologies
393 from Intel and AMD, VMware ran a few experiments to compare these
394 techniques to guest paravirtualization technique on VMware's platform.
395 These hardware assisted virtualization techniques have outperformed the
396 performance benefits provided by VMI in most of the workloads. VMware
397 expects that these hardware features will be ubiquitous in a couple of
398 years, as a result, VMware has started a phased retirement of this
399 feature from the hypervisor. We will be removing this feature from the
400 Kernel too. Right now we are targeting 2.6.37 but can retire earlier if
401 technical reasons (read opportunity to remove major chunk of pvops)
402 arise.
403
404 Please note that VMI has always been an optimization and non-VMI kernels
405 still work fine on VMware's platform.
406 Latest versions of VMware's product which support VMI are,
407 Workstation 7.0 and VSphere 4.0 on ESX side, future maintainence
408 releases for these products will continue supporting VMI.
409
410 For more details about VMI retirement take a look at this,
411 http://blogs.vmware.com/guestosguide/2009/09/vmi-retirement.html
412
413Who: Alok N Kataria <akataria@vmware.com>
414
415----------------------------
416
417What: Support for lcd_switch and display_get in asus-laptop driver 389What: Support for lcd_switch and display_get in asus-laptop driver
418When: March 2010 390When: March 2010
419Why: These two features use non-standard interfaces. There are the 391Why: These two features use non-standard interfaces. There are the
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 8dd7248508a9..3a0009e03d14 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -455,7 +455,7 @@ and is between 256 and 4096 characters. It is defined in the file
455 [ARM] imx_timer1,OSTS,netx_timer,mpu_timer2, 455 [ARM] imx_timer1,OSTS,netx_timer,mpu_timer2,
456 pxa_timer,timer3,32k_counter,timer0_1 456 pxa_timer,timer3,32k_counter,timer0_1
457 [AVR32] avr32 457 [AVR32] avr32
458 [X86-32] pit,hpet,tsc,vmi-timer; 458 [X86-32] pit,hpet,tsc;
459 scx200_hrt on Geode; cyclone on IBM x440 459 scx200_hrt on Geode; cyclone on IBM x440
460 [MIPS] MIPS 460 [MIPS] MIPS
461 [PARISC] cr16 461 [PARISC] cr16
@@ -2153,6 +2153,11 @@ and is between 256 and 4096 characters. It is defined in the file
2153 Reserves a hole at the top of the kernel virtual 2153 Reserves a hole at the top of the kernel virtual
2154 address space. 2154 address space.
2155 2155
2156 reservelow= [X86]
2157 Format: nn[K]
2158 Set the amount of memory to reserve for BIOS at
2159 the bottom of the address space.
2160
2156 reset_devices [KNL] Force drivers to reset the underlying device 2161 reset_devices [KNL] Force drivers to reset the underlying device
2157 during initialization. 2162 during initialization.
2158 2163
@@ -2435,6 +2440,10 @@ and is between 256 and 4096 characters. It is defined in the file
2435 disables clocksource verification at runtime. 2440 disables clocksource verification at runtime.
2436 Used to enable high-resolution timer mode on older 2441 Used to enable high-resolution timer mode on older
2437 hardware, and in virtualized environment. 2442 hardware, and in virtualized environment.
2443 [x86] noirqtime: Do not use TSC to do irq accounting.
2444 Used to run time disable IRQ_TIME_ACCOUNTING on any
2445 platforms where RDTSC is slow and this accounting
2446 can add overhead.
2438 2447
2439 turbografx.map[2|3]= [HW,JOY] 2448 turbografx.map[2|3]= [HW,JOY]
2440 TurboGraFX parallel port interface 2449 TurboGraFX parallel port interface
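
Both additions above are plain boot parameters. An illustrative command-line fragment, with arbitrarily chosen values, that a boot loader could append to the kernel line:

    reservelow=64K tsc=noirqtime
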
diff --git a/MAINTAINERS b/MAINTAINERS
index f2a2b8e647c5..6f5b5b2b528d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1527,6 +1527,8 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
1527S: Supported 1527S: Supported
1528F: Documentation/filesystems/ceph.txt 1528F: Documentation/filesystems/ceph.txt
1529F: fs/ceph 1529F: fs/ceph
1530F: net/ceph
1531F: include/linux/ceph
1530 1532
1531CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM: 1533CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM:
1532M: David Vrabel <david.vrabel@csr.com> 1534M: David Vrabel <david.vrabel@csr.com>
@@ -3239,6 +3241,12 @@ F: drivers/net/irda/
3239F: include/net/irda/ 3241F: include/net/irda/
3240F: net/irda/ 3242F: net/irda/
3241 3243
3244IRQ SUBSYSTEM
3245M: Thomas Gleixner <tglx@linutronix.de>
3246S: Maintained
3247T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git irq/core
3248F: kernel/irq/
3249
3242ISAPNP 3250ISAPNP
3243M: Jaroslav Kysela <perex@perex.cz> 3251M: Jaroslav Kysela <perex@perex.cz>
3244S: Maintained 3252S: Maintained
@@ -4805,6 +4813,15 @@ F: fs/qnx4/
4805F: include/linux/qnx4_fs.h 4813F: include/linux/qnx4_fs.h
4806F: include/linux/qnxtypes.h 4814F: include/linux/qnxtypes.h
4807 4815
4816RADOS BLOCK DEVICE (RBD)
4817F: include/linux/qnxtypes.h
4818M: Yehuda Sadeh <yehuda@hq.newdream.net>
4819M: Sage Weil <sage@newdream.net>
4820M: ceph-devel@vger.kernel.org
4821S: Supported
4822F: drivers/block/rbd.c
4823F: drivers/block/rbd_types.h
4824
4808RADEON FRAMEBUFFER DISPLAY DRIVER 4825RADEON FRAMEBUFFER DISPLAY DRIVER
4809M: Benjamin Herrenschmidt <benh@kernel.crashing.org> 4826M: Benjamin Herrenschmidt <benh@kernel.crashing.org>
4810L: linux-fbdev@vger.kernel.org 4827L: linux-fbdev@vger.kernel.org
diff --git a/arch/arm/include/asm/hw_irq.h b/arch/arm/include/asm/hw_irq.h
index 90831f6f5f5c..5586b7c8ef6f 100644
--- a/arch/arm/include/asm/hw_irq.h
+++ b/arch/arm/include/asm/hw_irq.h
@@ -24,4 +24,6 @@ void set_irq_flags(unsigned int irq, unsigned int flags);
24#define IRQF_PROBE (1 << 1) 24#define IRQF_PROBE (1 << 1)
25#define IRQF_NOAUTOEN (1 << 2) 25#define IRQF_NOAUTOEN (1 << 2)
26 26
27#define ARCH_IRQ_INIT_FLAGS (IRQ_NOREQUEST | IRQ_NOPROBE)
28
27#endif 29#endif
diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c
index c0d5c3b3a760..36ad3be4692a 100644
--- a/arch/arm/kernel/irq.c
+++ b/arch/arm/kernel/irq.c
@@ -154,14 +154,6 @@ void set_irq_flags(unsigned int irq, unsigned int iflags)
154 154
155void __init init_IRQ(void) 155void __init init_IRQ(void)
156{ 156{
157 struct irq_desc *desc;
158 int irq;
159
160 for (irq = 0; irq < nr_irqs; irq++) {
161 desc = irq_to_desc_alloc_node(irq, 0);
162 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
163 }
164
165 init_arch_irq(); 157 init_arch_irq();
166} 158}
167 159
@@ -169,7 +161,7 @@ void __init init_IRQ(void)
169int __init arch_probe_nr_irqs(void) 161int __init arch_probe_nr_irqs(void)
170{ 162{
171 nr_irqs = arch_nr_irqs ? arch_nr_irqs : NR_IRQS; 163 nr_irqs = arch_nr_irqs ? arch_nr_irqs : NR_IRQS;
172 return 0; 164 return nr_irqs;
173} 165}
174#endif 166#endif
175 167
diff --git a/arch/arm/mach-bcmring/dma.c b/arch/arm/mach-bcmring/dma.c
index 29c0a911df26..77eb35c89cd0 100644
--- a/arch/arm/mach-bcmring/dma.c
+++ b/arch/arm/mach-bcmring/dma.c
@@ -691,7 +691,7 @@ int dma_init(void)
691 691
692 memset(&gDMA, 0, sizeof(gDMA)); 692 memset(&gDMA, 0, sizeof(gDMA));
693 693
694 init_MUTEX_LOCKED(&gDMA.lock); 694 sema_init(&gDMA.lock, 0);
695 init_waitqueue_head(&gDMA.freeChannelQ); 695 init_waitqueue_head(&gDMA.freeChannelQ);
696 696
697 /* Initialize the Hardware */ 697 /* Initialize the Hardware */
@@ -1574,7 +1574,7 @@ int dma_init_mem_map(DMA_MemMap_t *memMap)
1574{ 1574{
1575 memset(memMap, 0, sizeof(*memMap)); 1575 memset(memMap, 0, sizeof(*memMap));
1576 1576
1577 init_MUTEX(&memMap->lock); 1577 sema_init(&memMap->lock, 1);
1578 1578
1579 return 0; 1579 return 0;
1580} 1580}
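
The bcmring conversion above relies on init_MUTEX() and init_MUTEX_LOCKED() being thin wrappers around sema_init(). A sketch of the equivalence, with a made-up semaphore name:

    #include <linux/semaphore.h>

    static struct semaphore demo_sem;

    static void demo_init(void)
    {
            sema_init(&demo_sem, 1);    /* was: init_MUTEX(&demo_sem) -- starts unlocked */
            /* sema_init(&demo_sem, 0)     was: init_MUTEX_LOCKED(&demo_sem) -- starts locked */
    }
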
diff --git a/arch/arm/mach-bcmring/irq.c b/arch/arm/mach-bcmring/irq.c
index dc1c4939b0ce..e3152631eb37 100644
--- a/arch/arm/mach-bcmring/irq.c
+++ b/arch/arm/mach-bcmring/irq.c
@@ -67,21 +67,21 @@ static void bcmring_unmask_irq2(unsigned int irq)
67} 67}
68 68
69static struct irq_chip bcmring_irq0_chip = { 69static struct irq_chip bcmring_irq0_chip = {
70 .typename = "ARM-INTC0", 70 .name = "ARM-INTC0",
71 .ack = bcmring_mask_irq0, 71 .ack = bcmring_mask_irq0,
72 .mask = bcmring_mask_irq0, /* mask a specific interrupt, blocking its delivery. */ 72 .mask = bcmring_mask_irq0, /* mask a specific interrupt, blocking its delivery. */
73 .unmask = bcmring_unmask_irq0, /* unmaks an interrupt */ 73 .unmask = bcmring_unmask_irq0, /* unmaks an interrupt */
74}; 74};
75 75
76static struct irq_chip bcmring_irq1_chip = { 76static struct irq_chip bcmring_irq1_chip = {
77 .typename = "ARM-INTC1", 77 .name = "ARM-INTC1",
78 .ack = bcmring_mask_irq1, 78 .ack = bcmring_mask_irq1,
79 .mask = bcmring_mask_irq1, 79 .mask = bcmring_mask_irq1,
80 .unmask = bcmring_unmask_irq1, 80 .unmask = bcmring_unmask_irq1,
81}; 81};
82 82
83static struct irq_chip bcmring_irq2_chip = { 83static struct irq_chip bcmring_irq2_chip = {
84 .typename = "ARM-SINTC", 84 .name = "ARM-SINTC",
85 .ack = bcmring_mask_irq2, 85 .ack = bcmring_mask_irq2,
86 .mask = bcmring_mask_irq2, 86 .mask = bcmring_mask_irq2,
87 .unmask = bcmring_unmask_irq2, 87 .unmask = bcmring_unmask_irq2,
diff --git a/arch/arm/mach-iop13xx/msi.c b/arch/arm/mach-iop13xx/msi.c
index f34b0ed80630..7149fcc16c8a 100644
--- a/arch/arm/mach-iop13xx/msi.c
+++ b/arch/arm/mach-iop13xx/msi.c
@@ -164,10 +164,10 @@ static void iop13xx_msi_nop(unsigned int irq)
164static struct irq_chip iop13xx_msi_chip = { 164static struct irq_chip iop13xx_msi_chip = {
165 .name = "PCI-MSI", 165 .name = "PCI-MSI",
166 .ack = iop13xx_msi_nop, 166 .ack = iop13xx_msi_nop,
167 .enable = unmask_msi_irq, 167 .irq_enable = unmask_msi_irq,
168 .disable = mask_msi_irq, 168 .irq_disable = mask_msi_irq,
169 .mask = mask_msi_irq, 169 .irq_mask = mask_msi_irq,
170 .unmask = unmask_msi_irq, 170 .irq_unmask = unmask_msi_irq,
171}; 171};
172 172
173int arch_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *desc) 173int arch_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *desc)
diff --git a/arch/ia64/include/asm/hardirq.h b/arch/ia64/include/asm/hardirq.h
index d514cd9edb49..8fb7d33a661f 100644
--- a/arch/ia64/include/asm/hardirq.h
+++ b/arch/ia64/include/asm/hardirq.h
@@ -6,12 +6,6 @@
6 * David Mosberger-Tang <davidm@hpl.hp.com> 6 * David Mosberger-Tang <davidm@hpl.hp.com>
7 */ 7 */
8 8
9
10#include <linux/threads.h>
11#include <linux/irq.h>
12
13#include <asm/processor.h>
14
15/* 9/*
16 * No irq_cpustat_t for IA-64. The data is held in the per-CPU data structure. 10 * No irq_cpustat_t for IA-64. The data is held in the per-CPU data structure.
17 */ 11 */
@@ -20,6 +14,11 @@
20 14
21#define local_softirq_pending() (local_cpu_data->softirq_pending) 15#define local_softirq_pending() (local_cpu_data->softirq_pending)
22 16
17#include <linux/threads.h>
18#include <linux/irq.h>
19
20#include <asm/processor.h>
21
23extern void __iomem *ipi_base_addr; 22extern void __iomem *ipi_base_addr;
24 23
25void ack_bad_irq(unsigned int irq); 24void ack_bad_irq(unsigned int irq);
diff --git a/arch/ia64/include/asm/system.h b/arch/ia64/include/asm/system.h
index 9f342a574ce8..dd028f2b13b3 100644
--- a/arch/ia64/include/asm/system.h
+++ b/arch/ia64/include/asm/system.h
@@ -272,10 +272,6 @@ void cpu_idle_wait(void);
272 272
273void default_idle(void); 273void default_idle(void);
274 274
275#ifdef CONFIG_VIRT_CPU_ACCOUNTING
276extern void account_system_vtime(struct task_struct *);
277#endif
278
279#endif /* __KERNEL__ */ 275#endif /* __KERNEL__ */
280 276
281#endif /* __ASSEMBLY__ */ 277#endif /* __ASSEMBLY__ */
diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c
index 4a746ea838ff..00b19a416eab 100644
--- a/arch/ia64/kernel/msi_ia64.c
+++ b/arch/ia64/kernel/msi_ia64.c
@@ -104,8 +104,8 @@ static int ia64_msi_retrigger_irq(unsigned int irq)
104 */ 104 */
105static struct irq_chip ia64_msi_chip = { 105static struct irq_chip ia64_msi_chip = {
106 .name = "PCI-MSI", 106 .name = "PCI-MSI",
107 .mask = mask_msi_irq, 107 .irq_mask = mask_msi_irq,
108 .unmask = unmask_msi_irq, 108 .irq_unmask = unmask_msi_irq,
109 .ack = ia64_ack_msi_irq, 109 .ack = ia64_ack_msi_irq,
110#ifdef CONFIG_SMP 110#ifdef CONFIG_SMP
111 .set_affinity = ia64_set_msi_irq_affinity, 111 .set_affinity = ia64_set_msi_irq_affinity,
@@ -160,8 +160,8 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
160 160
161static struct irq_chip dmar_msi_type = { 161static struct irq_chip dmar_msi_type = {
162 .name = "DMAR_MSI", 162 .name = "DMAR_MSI",
163 .unmask = dmar_msi_unmask, 163 .irq_unmask = dmar_msi_unmask,
164 .mask = dmar_msi_mask, 164 .irq_mask = dmar_msi_mask,
165 .ack = ia64_ack_msi_irq, 165 .ack = ia64_ack_msi_irq,
166#ifdef CONFIG_SMP 166#ifdef CONFIG_SMP
167 .set_affinity = dmar_msi_set_affinity, 167 .set_affinity = dmar_msi_set_affinity,
diff --git a/arch/ia64/sn/kernel/msi_sn.c b/arch/ia64/sn/kernel/msi_sn.c
index 0c72dd463831..a5e500f02853 100644
--- a/arch/ia64/sn/kernel/msi_sn.c
+++ b/arch/ia64/sn/kernel/msi_sn.c
@@ -228,8 +228,8 @@ static int sn_msi_retrigger_irq(unsigned int irq)
228 228
229static struct irq_chip sn_msi_chip = { 229static struct irq_chip sn_msi_chip = {
230 .name = "PCI-MSI", 230 .name = "PCI-MSI",
231 .mask = mask_msi_irq, 231 .irq_mask = mask_msi_irq,
232 .unmask = unmask_msi_irq, 232 .irq_unmask = unmask_msi_irq,
233 .ack = sn_ack_msi_irq, 233 .ack = sn_ack_msi_irq,
234#ifdef CONFIG_SMP 234#ifdef CONFIG_SMP
235 .set_affinity = sn_set_msi_irq_affinity, 235 .set_affinity = sn_set_msi_irq_affinity,
diff --git a/arch/m32r/kernel/irq.c b/arch/m32r/kernel/irq.c
index 3c71f776872c..7db26f1f082d 100644
--- a/arch/m32r/kernel/irq.c
+++ b/arch/m32r/kernel/irq.c
@@ -51,7 +51,7 @@ int show_interrupts(struct seq_file *p, void *v)
51 for_each_online_cpu(j) 51 for_each_online_cpu(j)
52 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); 52 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
53#endif 53#endif
54 seq_printf(p, " %14s", irq_desc[i].chip->typename); 54 seq_printf(p, " %14s", irq_desc[i].chip->name);
55 seq_printf(p, " %s", action->name); 55 seq_printf(p, " %s", action->name);
56 56
57 for (action=action->next; action; action = action->next) 57 for (action=action->next; action; action = action->next)
diff --git a/arch/m32r/platforms/m32104ut/setup.c b/arch/m32r/platforms/m32104ut/setup.c
index 922fdfdadeaa..402a59d7219b 100644
--- a/arch/m32r/platforms/m32104ut/setup.c
+++ b/arch/m32r/platforms/m32104ut/setup.c
@@ -65,7 +65,7 @@ static void shutdown_m32104ut_irq(unsigned int irq)
65 65
66static struct irq_chip m32104ut_irq_type = 66static struct irq_chip m32104ut_irq_type =
67{ 67{
68 .typename = "M32104UT-IRQ", 68 .name = "M32104UT-IRQ",
69 .startup = startup_m32104ut_irq, 69 .startup = startup_m32104ut_irq,
70 .shutdown = shutdown_m32104ut_irq, 70 .shutdown = shutdown_m32104ut_irq,
71 .enable = enable_m32104ut_irq, 71 .enable = enable_m32104ut_irq,
diff --git a/arch/m32r/platforms/m32700ut/setup.c b/arch/m32r/platforms/m32700ut/setup.c
index 9c1bc7487c1e..80b1a026795a 100644
--- a/arch/m32r/platforms/m32700ut/setup.c
+++ b/arch/m32r/platforms/m32700ut/setup.c
@@ -71,7 +71,7 @@ static void shutdown_m32700ut_irq(unsigned int irq)
71 71
72static struct irq_chip m32700ut_irq_type = 72static struct irq_chip m32700ut_irq_type =
73{ 73{
74 .typename = "M32700UT-IRQ", 74 .name = "M32700UT-IRQ",
75 .startup = startup_m32700ut_irq, 75 .startup = startup_m32700ut_irq,
76 .shutdown = shutdown_m32700ut_irq, 76 .shutdown = shutdown_m32700ut_irq,
77 .enable = enable_m32700ut_irq, 77 .enable = enable_m32700ut_irq,
@@ -148,7 +148,7 @@ static void shutdown_m32700ut_pld_irq(unsigned int irq)
148 148
149static struct irq_chip m32700ut_pld_irq_type = 149static struct irq_chip m32700ut_pld_irq_type =
150{ 150{
151 .typename = "M32700UT-PLD-IRQ", 151 .name = "M32700UT-PLD-IRQ",
152 .startup = startup_m32700ut_pld_irq, 152 .startup = startup_m32700ut_pld_irq,
153 .shutdown = shutdown_m32700ut_pld_irq, 153 .shutdown = shutdown_m32700ut_pld_irq,
154 .enable = enable_m32700ut_pld_irq, 154 .enable = enable_m32700ut_pld_irq,
@@ -217,7 +217,7 @@ static void shutdown_m32700ut_lanpld_irq(unsigned int irq)
217 217
218static struct irq_chip m32700ut_lanpld_irq_type = 218static struct irq_chip m32700ut_lanpld_irq_type =
219{ 219{
220 .typename = "M32700UT-PLD-LAN-IRQ", 220 .name = "M32700UT-PLD-LAN-IRQ",
221 .startup = startup_m32700ut_lanpld_irq, 221 .startup = startup_m32700ut_lanpld_irq,
222 .shutdown = shutdown_m32700ut_lanpld_irq, 222 .shutdown = shutdown_m32700ut_lanpld_irq,
223 .enable = enable_m32700ut_lanpld_irq, 223 .enable = enable_m32700ut_lanpld_irq,
@@ -286,7 +286,7 @@ static void shutdown_m32700ut_lcdpld_irq(unsigned int irq)
286 286
287static struct irq_chip m32700ut_lcdpld_irq_type = 287static struct irq_chip m32700ut_lcdpld_irq_type =
288{ 288{
289 .typename = "M32700UT-PLD-LCD-IRQ", 289 .name = "M32700UT-PLD-LCD-IRQ",
290 .startup = startup_m32700ut_lcdpld_irq, 290 .startup = startup_m32700ut_lcdpld_irq,
291 .shutdown = shutdown_m32700ut_lcdpld_irq, 291 .shutdown = shutdown_m32700ut_lcdpld_irq,
292 .enable = enable_m32700ut_lcdpld_irq, 292 .enable = enable_m32700ut_lcdpld_irq,
diff --git a/arch/m32r/platforms/mappi/setup.c b/arch/m32r/platforms/mappi/setup.c
index fb4b17799b66..ea00c84d6b1b 100644
--- a/arch/m32r/platforms/mappi/setup.c
+++ b/arch/m32r/platforms/mappi/setup.c
@@ -65,7 +65,7 @@ static void shutdown_mappi_irq(unsigned int irq)
65 65
66static struct irq_chip mappi_irq_type = 66static struct irq_chip mappi_irq_type =
67{ 67{
68 .typename = "MAPPI-IRQ", 68 .name = "MAPPI-IRQ",
69 .startup = startup_mappi_irq, 69 .startup = startup_mappi_irq,
70 .shutdown = shutdown_mappi_irq, 70 .shutdown = shutdown_mappi_irq,
71 .enable = enable_mappi_irq, 71 .enable = enable_mappi_irq,
diff --git a/arch/m32r/platforms/mappi2/setup.c b/arch/m32r/platforms/mappi2/setup.c
index 6a65eda0a056..c049376d0270 100644
--- a/arch/m32r/platforms/mappi2/setup.c
+++ b/arch/m32r/platforms/mappi2/setup.c
@@ -72,7 +72,7 @@ static void shutdown_mappi2_irq(unsigned int irq)
72 72
73static struct irq_chip mappi2_irq_type = 73static struct irq_chip mappi2_irq_type =
74{ 74{
75 .typename = "MAPPI2-IRQ", 75 .name = "MAPPI2-IRQ",
76 .startup = startup_mappi2_irq, 76 .startup = startup_mappi2_irq,
77 .shutdown = shutdown_mappi2_irq, 77 .shutdown = shutdown_mappi2_irq,
78 .enable = enable_mappi2_irq, 78 .enable = enable_mappi2_irq,
diff --git a/arch/m32r/platforms/mappi3/setup.c b/arch/m32r/platforms/mappi3/setup.c
index 9c337aeac94b..882de25c6e8c 100644
--- a/arch/m32r/platforms/mappi3/setup.c
+++ b/arch/m32r/platforms/mappi3/setup.c
@@ -72,7 +72,7 @@ static void shutdown_mappi3_irq(unsigned int irq)
72 72
73static struct irq_chip mappi3_irq_type = 73static struct irq_chip mappi3_irq_type =
74{ 74{
75 .typename = "MAPPI3-IRQ", 75 .name = "MAPPI3-IRQ",
76 .startup = startup_mappi3_irq, 76 .startup = startup_mappi3_irq,
77 .shutdown = shutdown_mappi3_irq, 77 .shutdown = shutdown_mappi3_irq,
78 .enable = enable_mappi3_irq, 78 .enable = enable_mappi3_irq,
diff --git a/arch/m32r/platforms/oaks32r/setup.c b/arch/m32r/platforms/oaks32r/setup.c
index ed865741c38d..d11d93bf74f5 100644
--- a/arch/m32r/platforms/oaks32r/setup.c
+++ b/arch/m32r/platforms/oaks32r/setup.c
@@ -63,7 +63,7 @@ static void shutdown_oaks32r_irq(unsigned int irq)
63 63
64static struct irq_chip oaks32r_irq_type = 64static struct irq_chip oaks32r_irq_type =
65{ 65{
66 .typename = "OAKS32R-IRQ", 66 .name = "OAKS32R-IRQ",
67 .startup = startup_oaks32r_irq, 67 .startup = startup_oaks32r_irq,
68 .shutdown = shutdown_oaks32r_irq, 68 .shutdown = shutdown_oaks32r_irq,
69 .enable = enable_oaks32r_irq, 69 .enable = enable_oaks32r_irq,
diff --git a/arch/m32r/platforms/opsput/setup.c b/arch/m32r/platforms/opsput/setup.c
index 80d680657019..5f3402a2fbaf 100644
--- a/arch/m32r/platforms/opsput/setup.c
+++ b/arch/m32r/platforms/opsput/setup.c
@@ -72,7 +72,7 @@ static void shutdown_opsput_irq(unsigned int irq)
72 72
73static struct irq_chip opsput_irq_type = 73static struct irq_chip opsput_irq_type =
74{ 74{
75 .typename = "OPSPUT-IRQ", 75 .name = "OPSPUT-IRQ",
76 .startup = startup_opsput_irq, 76 .startup = startup_opsput_irq,
77 .shutdown = shutdown_opsput_irq, 77 .shutdown = shutdown_opsput_irq,
78 .enable = enable_opsput_irq, 78 .enable = enable_opsput_irq,
@@ -149,7 +149,7 @@ static void shutdown_opsput_pld_irq(unsigned int irq)
149 149
150static struct irq_chip opsput_pld_irq_type = 150static struct irq_chip opsput_pld_irq_type =
151{ 151{
152 .typename = "OPSPUT-PLD-IRQ", 152 .name = "OPSPUT-PLD-IRQ",
153 .startup = startup_opsput_pld_irq, 153 .startup = startup_opsput_pld_irq,
154 .shutdown = shutdown_opsput_pld_irq, 154 .shutdown = shutdown_opsput_pld_irq,
155 .enable = enable_opsput_pld_irq, 155 .enable = enable_opsput_pld_irq,
@@ -218,7 +218,7 @@ static void shutdown_opsput_lanpld_irq(unsigned int irq)
218 218
219static struct irq_chip opsput_lanpld_irq_type = 219static struct irq_chip opsput_lanpld_irq_type =
220{ 220{
221 .typename = "OPSPUT-PLD-LAN-IRQ", 221 .name = "OPSPUT-PLD-LAN-IRQ",
222 .startup = startup_opsput_lanpld_irq, 222 .startup = startup_opsput_lanpld_irq,
223 .shutdown = shutdown_opsput_lanpld_irq, 223 .shutdown = shutdown_opsput_lanpld_irq,
224 .enable = enable_opsput_lanpld_irq, 224 .enable = enable_opsput_lanpld_irq,
diff --git a/arch/m32r/platforms/usrv/setup.c b/arch/m32r/platforms/usrv/setup.c
index 757302660af8..1beac7a51ed4 100644
--- a/arch/m32r/platforms/usrv/setup.c
+++ b/arch/m32r/platforms/usrv/setup.c
@@ -63,7 +63,7 @@ static void shutdown_mappi_irq(unsigned int irq)
63 63
64static struct irq_chip mappi_irq_type = 64static struct irq_chip mappi_irq_type =
65{ 65{
66 .typename = "M32700-IRQ", 66 .name = "M32700-IRQ",
67 .startup = startup_mappi_irq, 67 .startup = startup_mappi_irq,
68 .shutdown = shutdown_mappi_irq, 68 .shutdown = shutdown_mappi_irq,
69 .enable = enable_mappi_irq, 69 .enable = enable_mappi_irq,
@@ -136,7 +136,7 @@ static void shutdown_m32700ut_pld_irq(unsigned int irq)
136 136
137static struct irq_chip m32700ut_pld_irq_type = 137static struct irq_chip m32700ut_pld_irq_type =
138{ 138{
139 .typename = "USRV-PLD-IRQ", 139 .name = "USRV-PLD-IRQ",
140 .startup = startup_m32700ut_pld_irq, 140 .startup = startup_m32700ut_pld_irq,
141 .shutdown = shutdown_m32700ut_pld_irq, 141 .shutdown = shutdown_m32700ut_pld_irq,
142 .enable = enable_m32700ut_pld_irq, 142 .enable = enable_m32700ut_pld_irq,
diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c
index 2340f11dc29c..9a526ba6f257 100644
--- a/arch/mips/kernel/mips-mt-fpaff.c
+++ b/arch/mips/kernel/mips-mt-fpaff.c
@@ -103,7 +103,7 @@ asmlinkage long mipsmt_sys_sched_setaffinity(pid_t pid, unsigned int len,
103 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 103 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
104 goto out_unlock; 104 goto out_unlock;
105 105
106 retval = security_task_setscheduler(p, 0, NULL); 106 retval = security_task_setscheduler(p);
107 if (retval) 107 if (retval)
108 goto out_unlock; 108 goto out_unlock;
109 109
diff --git a/arch/powerpc/include/asm/system.h b/arch/powerpc/include/asm/system.h
index 6c294acac848..9c3d160670b4 100644
--- a/arch/powerpc/include/asm/system.h
+++ b/arch/powerpc/include/asm/system.h
@@ -542,10 +542,6 @@ extern void reloc_got2(unsigned long);
542 542
543#define PTRRELOC(x) ((typeof(x)) add_reloc_offset((unsigned long)(x))) 543#define PTRRELOC(x) ((typeof(x)) add_reloc_offset((unsigned long)(x)))
544 544
545#ifdef CONFIG_VIRT_CPU_ACCOUNTING
546extern void account_system_vtime(struct task_struct *);
547#endif
548
549extern struct dentry *powerpc_debugfs_root; 545extern struct dentry *powerpc_debugfs_root;
550 546
551#endif /* __KERNEL__ */ 547#endif /* __KERNEL__ */
diff --git a/arch/powerpc/platforms/cell/axon_msi.c b/arch/powerpc/platforms/cell/axon_msi.c
index 97085530aa63..e3e379c6caa7 100644
--- a/arch/powerpc/platforms/cell/axon_msi.c
+++ b/arch/powerpc/platforms/cell/axon_msi.c
@@ -310,9 +310,9 @@ static void axon_msi_teardown_msi_irqs(struct pci_dev *dev)
310} 310}
311 311
312static struct irq_chip msic_irq_chip = { 312static struct irq_chip msic_irq_chip = {
313 .mask = mask_msi_irq, 313 .irq_mask = mask_msi_irq,
314 .unmask = unmask_msi_irq, 314 .irq_unmask = unmask_msi_irq,
315 .shutdown = unmask_msi_irq, 315 .irq_shutdown = mask_msi_irq,
316 .name = "AXON-MSI", 316 .name = "AXON-MSI",
317}; 317};
318 318
diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c
index 93834b0d8272..67e2c4bdac8f 100644
--- a/arch/powerpc/platforms/pseries/xics.c
+++ b/arch/powerpc/platforms/pseries/xics.c
@@ -243,7 +243,7 @@ static unsigned int xics_startup(unsigned int virq)
243 * at that level, so we do it here by hand. 243 * at that level, so we do it here by hand.
244 */ 244 */
245 if (irq_to_desc(virq)->msi_desc) 245 if (irq_to_desc(virq)->msi_desc)
246 unmask_msi_irq(virq); 246 unmask_msi_irq(irq_get_irq_data(virq));
247 247
248 /* unmask it */ 248 /* unmask it */
249 xics_unmask_irq(virq); 249 xics_unmask_irq(virq);
diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c
index 87991d3abbab..bdbd896c89d8 100644
--- a/arch/powerpc/sysdev/fsl_msi.c
+++ b/arch/powerpc/sysdev/fsl_msi.c
@@ -51,8 +51,8 @@ static void fsl_msi_end_irq(unsigned int virq)
51} 51}
52 52
53static struct irq_chip fsl_msi_chip = { 53static struct irq_chip fsl_msi_chip = {
54 .mask = mask_msi_irq, 54 .irq_mask = mask_msi_irq,
55 .unmask = unmask_msi_irq, 55 .irq_unmask = unmask_msi_irq,
56 .ack = fsl_msi_end_irq, 56 .ack = fsl_msi_end_irq,
57 .name = "FSL-MSI", 57 .name = "FSL-MSI",
58}; 58};
diff --git a/arch/powerpc/sysdev/mpic_pasemi_msi.c b/arch/powerpc/sysdev/mpic_pasemi_msi.c
index 3b6a9a43718f..320ad5a9a25d 100644
--- a/arch/powerpc/sysdev/mpic_pasemi_msi.c
+++ b/arch/powerpc/sysdev/mpic_pasemi_msi.c
@@ -39,24 +39,24 @@
39static struct mpic *msi_mpic; 39static struct mpic *msi_mpic;
40 40
41 41
42static void mpic_pasemi_msi_mask_irq(unsigned int irq) 42static void mpic_pasemi_msi_mask_irq(struct irq_data *data)
43{ 43{
44 pr_debug("mpic_pasemi_msi_mask_irq %d\n", irq); 44 pr_debug("mpic_pasemi_msi_mask_irq %d\n", data->irq);
45 mask_msi_irq(irq); 45 mask_msi_irq(data);
46 mpic_mask_irq(irq); 46 mpic_mask_irq(data->irq);
47} 47}
48 48
49static void mpic_pasemi_msi_unmask_irq(unsigned int irq) 49static void mpic_pasemi_msi_unmask_irq(struct irq_data *data)
50{ 50{
51 pr_debug("mpic_pasemi_msi_unmask_irq %d\n", irq); 51 pr_debug("mpic_pasemi_msi_unmask_irq %d\n", data->irq);
52 mpic_unmask_irq(irq); 52 mpic_unmask_irq(data->irq);
53 unmask_msi_irq(irq); 53 unmask_msi_irq(data);
54} 54}
55 55
56static struct irq_chip mpic_pasemi_msi_chip = { 56static struct irq_chip mpic_pasemi_msi_chip = {
57 .shutdown = mpic_pasemi_msi_mask_irq, 57 .irq_shutdown = mpic_pasemi_msi_mask_irq,
58 .mask = mpic_pasemi_msi_mask_irq, 58 .irq_mask = mpic_pasemi_msi_mask_irq,
59 .unmask = mpic_pasemi_msi_unmask_irq, 59 .irq_unmask = mpic_pasemi_msi_unmask_irq,
60 .eoi = mpic_end_irq, 60 .eoi = mpic_end_irq,
61 .set_type = mpic_set_irq_type, 61 .set_type = mpic_set_irq_type,
62 .set_affinity = mpic_set_affinity, 62 .set_affinity = mpic_set_affinity,
diff --git a/arch/powerpc/sysdev/mpic_u3msi.c b/arch/powerpc/sysdev/mpic_u3msi.c
index bcbfe79c704b..a2b028b4a202 100644
--- a/arch/powerpc/sysdev/mpic_u3msi.c
+++ b/arch/powerpc/sysdev/mpic_u3msi.c
@@ -23,22 +23,22 @@
23/* A bit ugly, can we get this from the pci_dev somehow? */ 23/* A bit ugly, can we get this from the pci_dev somehow? */
24static struct mpic *msi_mpic; 24static struct mpic *msi_mpic;
25 25
26static void mpic_u3msi_mask_irq(unsigned int irq) 26static void mpic_u3msi_mask_irq(struct irq_data *data)
27{ 27{
28 mask_msi_irq(irq); 28 mask_msi_irq(data);
29 mpic_mask_irq(irq); 29 mpic_mask_irq(data->irq);
30} 30}
31 31
32static void mpic_u3msi_unmask_irq(unsigned int irq) 32static void mpic_u3msi_unmask_irq(struct irq_data *data)
33{ 33{
34 mpic_unmask_irq(irq); 34 mpic_unmask_irq(data->irq);
35 unmask_msi_irq(irq); 35 unmask_msi_irq(data);
36} 36}
37 37
38static struct irq_chip mpic_u3msi_chip = { 38static struct irq_chip mpic_u3msi_chip = {
39 .shutdown = mpic_u3msi_mask_irq, 39 .irq_shutdown = mpic_u3msi_mask_irq,
40 .mask = mpic_u3msi_mask_irq, 40 .irq_mask = mpic_u3msi_mask_irq,
41 .unmask = mpic_u3msi_unmask_irq, 41 .irq_unmask = mpic_u3msi_unmask_irq,
42 .eoi = mpic_end_irq, 42 .eoi = mpic_end_irq,
43 .set_type = mpic_set_irq_type, 43 .set_type = mpic_set_irq_type,
44 .set_affinity = mpic_set_affinity, 44 .set_affinity = mpic_set_affinity,
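For readers following the irq_chip conversions above, here is a minimal sketch (not taken from any of these files; the example_* identifiers are hypothetical) of a fully converted MSI chip, assuming <linux/irq.h> and <linux/msi.h>: the callbacks receive a struct irq_data pointer, the interrupt number comes from data->irq, and the generic mask_msi_irq()/unmask_msi_irq() helpers now take that same pointer.

static void example_msi_mask(struct irq_data *data)
{
	pr_debug("masking MSI source for irq %d\n", data->irq);
	mask_msi_irq(data);		/* generic MSI helper, irq_data based */
}

static void example_msi_unmask(struct irq_data *data)
{
	pr_debug("unmasking MSI source for irq %d\n", data->irq);
	unmask_msi_irq(data);
}

static struct irq_chip example_msi_chip = {
	.name		= "EXAMPLE-MSI",
	.irq_mask	= example_msi_mask,
	.irq_unmask	= example_msi_unmask,
	.irq_shutdown	= example_msi_mask,	/* mask on shutdown, as the MPIC chips above do */
};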
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 958f0dadeadf..75976a141947 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -199,6 +199,13 @@ config HOTPLUG_CPU
199 can be controlled through /sys/devices/system/cpu/cpu#. 199 can be controlled through /sys/devices/system/cpu/cpu#.
200 Say N if you want to disable CPU hotplug. 200 Say N if you want to disable CPU hotplug.
201 201
202config SCHED_BOOK
203 bool "Book scheduler support"
204 depends on SMP
205 help
206 Book scheduler support improves the CPU scheduler's decision making
207 when dealing with machines that have several books.
208
202config MATHEMU 209config MATHEMU
203 bool "IEEE FPU emulation" 210 bool "IEEE FPU emulation"
204 depends on MARCH_G5 211 depends on MARCH_G5
diff --git a/arch/s390/include/asm/hardirq.h b/arch/s390/include/asm/hardirq.h
index 498bc3892385..881d94590aeb 100644
--- a/arch/s390/include/asm/hardirq.h
+++ b/arch/s390/include/asm/hardirq.h
@@ -12,10 +12,6 @@
12#ifndef __ASM_HARDIRQ_H 12#ifndef __ASM_HARDIRQ_H
13#define __ASM_HARDIRQ_H 13#define __ASM_HARDIRQ_H
14 14
15#include <linux/threads.h>
16#include <linux/sched.h>
17#include <linux/cache.h>
18#include <linux/interrupt.h>
19#include <asm/lowcore.h> 15#include <asm/lowcore.h>
20 16
21#define local_softirq_pending() (S390_lowcore.softirq_pending) 17#define local_softirq_pending() (S390_lowcore.softirq_pending)
diff --git a/arch/s390/include/asm/system.h b/arch/s390/include/asm/system.h
index cef66210c846..38ddd8a9a9e8 100644
--- a/arch/s390/include/asm/system.h
+++ b/arch/s390/include/asm/system.h
@@ -97,7 +97,6 @@ static inline void restore_access_regs(unsigned int *acrs)
97 97
98extern void account_vtime(struct task_struct *, struct task_struct *); 98extern void account_vtime(struct task_struct *, struct task_struct *);
99extern void account_tick_vtime(struct task_struct *); 99extern void account_tick_vtime(struct task_struct *);
100extern void account_system_vtime(struct task_struct *);
101 100
102#ifdef CONFIG_PFAULT 101#ifdef CONFIG_PFAULT
103extern void pfault_irq_init(void); 102extern void pfault_irq_init(void);
diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h
index 831bd033ea77..051107a2c5e2 100644
--- a/arch/s390/include/asm/topology.h
+++ b/arch/s390/include/asm/topology.h
@@ -3,15 +3,32 @@
3 3
4#include <linux/cpumask.h> 4#include <linux/cpumask.h>
5 5
6#define mc_capable() (1)
7
8const struct cpumask *cpu_coregroup_mask(unsigned int cpu);
9
10extern unsigned char cpu_core_id[NR_CPUS]; 6extern unsigned char cpu_core_id[NR_CPUS];
11extern cpumask_t cpu_core_map[NR_CPUS]; 7extern cpumask_t cpu_core_map[NR_CPUS];
12 8
9static inline const struct cpumask *cpu_coregroup_mask(unsigned int cpu)
10{
11 return &cpu_core_map[cpu];
12}
13
13#define topology_core_id(cpu) (cpu_core_id[cpu]) 14#define topology_core_id(cpu) (cpu_core_id[cpu])
14#define topology_core_cpumask(cpu) (&cpu_core_map[cpu]) 15#define topology_core_cpumask(cpu) (&cpu_core_map[cpu])
16#define mc_capable() (1)
17
18#ifdef CONFIG_SCHED_BOOK
19
20extern unsigned char cpu_book_id[NR_CPUS];
21extern cpumask_t cpu_book_map[NR_CPUS];
22
23static inline const struct cpumask *cpu_book_mask(unsigned int cpu)
24{
25 return &cpu_book_map[cpu];
26}
27
28#define topology_book_id(cpu) (cpu_book_id[cpu])
29#define topology_book_cpumask(cpu) (&cpu_book_map[cpu])
30
31#endif /* CONFIG_SCHED_BOOK */
15 32
16int topology_set_cpu_management(int fc); 33int topology_set_cpu_management(int fc);
17void topology_schedule_update(void); 34void topology_schedule_update(void);
@@ -30,6 +47,8 @@ static inline void s390_init_cpu_topology(void)
30}; 47};
31#endif 48#endif
32 49
50#define SD_BOOK_INIT SD_CPU_INIT
51
33#include <asm-generic/topology.h> 52#include <asm-generic/topology.h>
34 53
35#endif /* _ASM_S390_TOPOLOGY_H */ 54#endif /* _ASM_S390_TOPOLOGY_H */
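A hedged usage sketch for the accessors declared above (example_dump_topology() is hypothetical and not part of the patch); it relies only on the masks and macros from this header plus the generic cpumask helpers.

static void __init example_dump_topology(void)
{
	int cpu;

	for_each_present_cpu(cpu) {
		pr_info("cpu %d: core %d, %d cpus in core group\n", cpu,
			topology_core_id(cpu),
			cpumask_weight(cpu_coregroup_mask(cpu)));
#ifdef CONFIG_SCHED_BOOK
		pr_info("cpu %d: book %d, %d cpus in book\n", cpu,
			topology_book_id(cpu),
			cpumask_weight(cpu_book_mask(cpu)));
#endif
	}
}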
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index bcef00766a64..13559c993847 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -57,8 +57,8 @@ struct tl_info {
57 union tl_entry tle[0]; 57 union tl_entry tle[0];
58}; 58};
59 59
60struct core_info { 60struct mask_info {
61 struct core_info *next; 61 struct mask_info *next;
62 unsigned char id; 62 unsigned char id;
63 cpumask_t mask; 63 cpumask_t mask;
64}; 64};
@@ -66,7 +66,6 @@ struct core_info {
66static int topology_enabled; 66static int topology_enabled;
67static void topology_work_fn(struct work_struct *work); 67static void topology_work_fn(struct work_struct *work);
68static struct tl_info *tl_info; 68static struct tl_info *tl_info;
69static struct core_info core_info;
70static int machine_has_topology; 69static int machine_has_topology;
71static struct timer_list topology_timer; 70static struct timer_list topology_timer;
72static void set_topology_timer(void); 71static void set_topology_timer(void);
@@ -74,38 +73,37 @@ static DECLARE_WORK(topology_work, topology_work_fn);
74/* topology_lock protects the core linked list */ 73/* topology_lock protects the core linked list */
75static DEFINE_SPINLOCK(topology_lock); 74static DEFINE_SPINLOCK(topology_lock);
76 75
76static struct mask_info core_info;
77cpumask_t cpu_core_map[NR_CPUS]; 77cpumask_t cpu_core_map[NR_CPUS];
78unsigned char cpu_core_id[NR_CPUS]; 78unsigned char cpu_core_id[NR_CPUS];
79 79
80static cpumask_t cpu_coregroup_map(unsigned int cpu) 80#ifdef CONFIG_SCHED_BOOK
81static struct mask_info book_info;
82cpumask_t cpu_book_map[NR_CPUS];
83unsigned char cpu_book_id[NR_CPUS];
84#endif
85
86static cpumask_t cpu_group_map(struct mask_info *info, unsigned int cpu)
81{ 87{
82 struct core_info *core = &core_info;
83 unsigned long flags;
84 cpumask_t mask; 88 cpumask_t mask;
85 89
86 cpus_clear(mask); 90 cpus_clear(mask);
87 if (!topology_enabled || !machine_has_topology) 91 if (!topology_enabled || !machine_has_topology)
88 return cpu_possible_map; 92 return cpu_possible_map;
89 spin_lock_irqsave(&topology_lock, flags); 93 while (info) {
90 while (core) { 94 if (cpu_isset(cpu, info->mask)) {
91 if (cpu_isset(cpu, core->mask)) { 95 mask = info->mask;
92 mask = core->mask;
93 break; 96 break;
94 } 97 }
95 core = core->next; 98 info = info->next;
96 } 99 }
97 spin_unlock_irqrestore(&topology_lock, flags);
98 if (cpus_empty(mask)) 100 if (cpus_empty(mask))
99 mask = cpumask_of_cpu(cpu); 101 mask = cpumask_of_cpu(cpu);
100 return mask; 102 return mask;
101} 103}
102 104
103const struct cpumask *cpu_coregroup_mask(unsigned int cpu) 105static void add_cpus_to_mask(struct tl_cpu *tl_cpu, struct mask_info *book,
104{ 106 struct mask_info *core)
105 return &cpu_core_map[cpu];
106}
107
108static void add_cpus_to_core(struct tl_cpu *tl_cpu, struct core_info *core)
109{ 107{
110 unsigned int cpu; 108 unsigned int cpu;
111 109
@@ -117,23 +115,35 @@ static void add_cpus_to_core(struct tl_cpu *tl_cpu, struct core_info *core)
117 115
118 rcpu = CPU_BITS - 1 - cpu + tl_cpu->origin; 116 rcpu = CPU_BITS - 1 - cpu + tl_cpu->origin;
119 for_each_present_cpu(lcpu) { 117 for_each_present_cpu(lcpu) {
120 if (cpu_logical_map(lcpu) == rcpu) { 118 if (cpu_logical_map(lcpu) != rcpu)
121 cpu_set(lcpu, core->mask); 119 continue;
122 cpu_core_id[lcpu] = core->id; 120#ifdef CONFIG_SCHED_BOOK
123 smp_cpu_polarization[lcpu] = tl_cpu->pp; 121 cpu_set(lcpu, book->mask);
124 } 122 cpu_book_id[lcpu] = book->id;
123#endif
124 cpu_set(lcpu, core->mask);
125 cpu_core_id[lcpu] = core->id;
126 smp_cpu_polarization[lcpu] = tl_cpu->pp;
125 } 127 }
126 } 128 }
127} 129}
128 130
129static void clear_cores(void) 131static void clear_masks(void)
130{ 132{
131 struct core_info *core = &core_info; 133 struct mask_info *info;
132 134
133 while (core) { 135 info = &core_info;
134 cpus_clear(core->mask); 136 while (info) {
135 core = core->next; 137 cpus_clear(info->mask);
138 info = info->next;
139 }
140#ifdef CONFIG_SCHED_BOOK
141 info = &book_info;
142 while (info) {
143 cpus_clear(info->mask);
144 info = info->next;
136 } 145 }
146#endif
137} 147}
138 148
139static union tl_entry *next_tle(union tl_entry *tle) 149static union tl_entry *next_tle(union tl_entry *tle)
@@ -146,29 +156,36 @@ static union tl_entry *next_tle(union tl_entry *tle)
146 156
147static void tl_to_cores(struct tl_info *info) 157static void tl_to_cores(struct tl_info *info)
148{ 158{
159#ifdef CONFIG_SCHED_BOOK
160 struct mask_info *book = &book_info;
161#else
162 struct mask_info *book = NULL;
163#endif
164 struct mask_info *core = &core_info;
149 union tl_entry *tle, *end; 165 union tl_entry *tle, *end;
150 struct core_info *core = &core_info; 166
151 167
152 spin_lock_irq(&topology_lock); 168 spin_lock_irq(&topology_lock);
153 clear_cores(); 169 clear_masks();
154 tle = info->tle; 170 tle = info->tle;
155 end = (union tl_entry *)((unsigned long)info + info->length); 171 end = (union tl_entry *)((unsigned long)info + info->length);
156 while (tle < end) { 172 while (tle < end) {
157 switch (tle->nl) { 173 switch (tle->nl) {
158 case 5: 174#ifdef CONFIG_SCHED_BOOK
159 case 4:
160 case 3:
161 case 2: 175 case 2:
176 book = book->next;
177 book->id = tle->container.id;
162 break; 178 break;
179#endif
163 case 1: 180 case 1:
164 core = core->next; 181 core = core->next;
165 core->id = tle->container.id; 182 core->id = tle->container.id;
166 break; 183 break;
167 case 0: 184 case 0:
168 add_cpus_to_core(&tle->cpu, core); 185 add_cpus_to_mask(&tle->cpu, book, core);
169 break; 186 break;
170 default: 187 default:
171 clear_cores(); 188 clear_masks();
172 machine_has_topology = 0; 189 machine_has_topology = 0;
173 goto out; 190 goto out;
174 } 191 }
@@ -221,10 +238,29 @@ int topology_set_cpu_management(int fc)
221 238
222static void update_cpu_core_map(void) 239static void update_cpu_core_map(void)
223{ 240{
241 unsigned long flags;
224 int cpu; 242 int cpu;
225 243
226 for_each_possible_cpu(cpu) 244 spin_lock_irqsave(&topology_lock, flags);
227 cpu_core_map[cpu] = cpu_coregroup_map(cpu); 245 for_each_possible_cpu(cpu) {
246 cpu_core_map[cpu] = cpu_group_map(&core_info, cpu);
247#ifdef CONFIG_SCHED_BOOK
248 cpu_book_map[cpu] = cpu_group_map(&book_info, cpu);
249#endif
250 }
251 spin_unlock_irqrestore(&topology_lock, flags);
252}
253
254static void store_topology(struct tl_info *info)
255{
256#ifdef CONFIG_SCHED_BOOK
257 int rc;
258
259 rc = stsi(info, 15, 1, 3);
260 if (rc != -ENOSYS)
261 return;
262#endif
263 stsi(info, 15, 1, 2);
228} 264}
229 265
230int arch_update_cpu_topology(void) 266int arch_update_cpu_topology(void)
@@ -238,7 +274,7 @@ int arch_update_cpu_topology(void)
238 topology_update_polarization_simple(); 274 topology_update_polarization_simple();
239 return 0; 275 return 0;
240 } 276 }
241 stsi(info, 15, 1, 2); 277 store_topology(info);
242 tl_to_cores(info); 278 tl_to_cores(info);
243 update_cpu_core_map(); 279 update_cpu_core_map();
244 for_each_online_cpu(cpu) { 280 for_each_online_cpu(cpu) {
@@ -299,12 +335,24 @@ out:
299} 335}
300__initcall(init_topology_update); 336__initcall(init_topology_update);
301 337
338static void alloc_masks(struct tl_info *info, struct mask_info *mask, int offset)
339{
340 int i, nr_masks;
341
342 nr_masks = info->mag[NR_MAG - offset];
343 for (i = 0; i < info->mnest - offset; i++)
344 nr_masks *= info->mag[NR_MAG - offset - 1 - i];
345 nr_masks = max(nr_masks, 1);
346 for (i = 0; i < nr_masks; i++) {
347 mask->next = alloc_bootmem(sizeof(struct mask_info));
348 mask = mask->next;
349 }
350}
351
302void __init s390_init_cpu_topology(void) 352void __init s390_init_cpu_topology(void)
303{ 353{
304 unsigned long long facility_bits; 354 unsigned long long facility_bits;
305 struct tl_info *info; 355 struct tl_info *info;
306 struct core_info *core;
307 int nr_cores;
308 int i; 356 int i;
309 357
310 if (stfle(&facility_bits, 1) <= 0) 358 if (stfle(&facility_bits, 1) <= 0)
@@ -315,25 +363,13 @@ void __init s390_init_cpu_topology(void)
315 363
316 tl_info = alloc_bootmem_pages(PAGE_SIZE); 364 tl_info = alloc_bootmem_pages(PAGE_SIZE);
317 info = tl_info; 365 info = tl_info;
318 stsi(info, 15, 1, 2); 366 store_topology(info);
319
320 nr_cores = info->mag[NR_MAG - 2];
321 for (i = 0; i < info->mnest - 2; i++)
322 nr_cores *= info->mag[NR_MAG - 3 - i];
323
324 pr_info("The CPU configuration topology of the machine is:"); 367 pr_info("The CPU configuration topology of the machine is:");
325 for (i = 0; i < NR_MAG; i++) 368 for (i = 0; i < NR_MAG; i++)
326 printk(" %d", info->mag[i]); 369 printk(" %d", info->mag[i]);
327 printk(" / %d\n", info->mnest); 370 printk(" / %d\n", info->mnest);
328 371 alloc_masks(info, &core_info, 2);
329 core = &core_info; 372#ifdef CONFIG_SCHED_BOOK
330 for (i = 0; i < nr_cores; i++) { 373 alloc_masks(info, &book_info, 3);
331 core->next = alloc_bootmem(sizeof(struct core_info)); 374#endif
332 core = core->next;
333 if (!core)
334 goto error;
335 }
336 return;
337error:
338 machine_has_topology = 0;
339} 375}
diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
index 257de1f0692b..ae5bac39b896 100644
--- a/arch/sh/kernel/irq.c
+++ b/arch/sh/kernel/irq.c
@@ -290,7 +290,7 @@ void __init init_IRQ(void)
290int __init arch_probe_nr_irqs(void) 290int __init arch_probe_nr_irqs(void)
291{ 291{
292 nr_irqs = sh_mv.mv_nr_irqs; 292 nr_irqs = sh_mv.mv_nr_irqs;
293 return 0; 293 return NR_IRQS_LEGACY;
294} 294}
295#endif 295#endif
296 296
diff --git a/arch/sparc/kernel/pci_msi.c b/arch/sparc/kernel/pci_msi.c
index 548b8ca9c210..b210416ace7b 100644
--- a/arch/sparc/kernel/pci_msi.c
+++ b/arch/sparc/kernel/pci_msi.c
@@ -114,10 +114,10 @@ static void free_msi(struct pci_pbm_info *pbm, int msi_num)
114 114
115static struct irq_chip msi_irq = { 115static struct irq_chip msi_irq = {
116 .name = "PCI-MSI", 116 .name = "PCI-MSI",
117 .mask = mask_msi_irq, 117 .irq_mask = mask_msi_irq,
118 .unmask = unmask_msi_irq, 118 .irq_unmask = unmask_msi_irq,
119 .enable = unmask_msi_irq, 119 .irq_enable = unmask_msi_irq,
120 .disable = mask_msi_irq, 120 .irq_disable = mask_msi_irq,
121 /* XXX affinity XXX */ 121 /* XXX affinity XXX */
122}; 122};
123 123
diff --git a/arch/tile/kernel/irq.c b/arch/tile/kernel/irq.c
index 596c60086930..9a27d563fc30 100644
--- a/arch/tile/kernel/irq.c
+++ b/arch/tile/kernel/irq.c
@@ -208,7 +208,7 @@ static void tile_irq_chip_eoi(unsigned int irq)
208} 208}
209 209
210static struct irq_chip tile_irq_chip = { 210static struct irq_chip tile_irq_chip = {
211 .typename = "tile_irq_chip", 211 .name = "tile_irq_chip",
212 .ack = tile_irq_chip_ack, 212 .ack = tile_irq_chip_ack,
213 .eoi = tile_irq_chip_eoi, 213 .eoi = tile_irq_chip_eoi,
214 .mask = tile_irq_chip_mask, 214 .mask = tile_irq_chip_mask,
@@ -288,7 +288,7 @@ int show_interrupts(struct seq_file *p, void *v)
288 for_each_online_cpu(j) 288 for_each_online_cpu(j)
289 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); 289 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
290#endif 290#endif
291 seq_printf(p, " %14s", irq_desc[i].chip->typename); 291 seq_printf(p, " %14s", irq_desc[i].chip->name);
292 seq_printf(p, " %s", action->name); 292 seq_printf(p, " %s", action->name);
293 293
294 for (action = action->next; action; action = action->next) 294 for (action = action->next; action; action = action->next)
diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
index a3f0b04d7101..a746e3037a5b 100644
--- a/arch/um/kernel/irq.c
+++ b/arch/um/kernel/irq.c
@@ -46,7 +46,7 @@ int show_interrupts(struct seq_file *p, void *v)
46 for_each_online_cpu(j) 46 for_each_online_cpu(j)
47 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); 47 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
48#endif 48#endif
49 seq_printf(p, " %14s", irq_desc[i].chip->typename); 49 seq_printf(p, " %14s", irq_desc[i].chip->name);
50 seq_printf(p, " %s", action->name); 50 seq_printf(p, " %s", action->name);
51 51
52 for (action=action->next; action; action = action->next) 52 for (action=action->next; action; action = action->next)
@@ -369,7 +369,7 @@ static void dummy(unsigned int irq)
369 369
370/* This is used for everything else than the timer. */ 370/* This is used for everything else than the timer. */
371static struct irq_chip normal_irq_type = { 371static struct irq_chip normal_irq_type = {
372 .typename = "SIGIO", 372 .name = "SIGIO",
373 .release = free_irq_by_irq_and_dev, 373 .release = free_irq_by_irq_and_dev,
374 .disable = dummy, 374 .disable = dummy,
375 .enable = dummy, 375 .enable = dummy,
@@ -378,7 +378,7 @@ static struct irq_chip normal_irq_type = {
378}; 378};
379 379
380static struct irq_chip SIGVTALRM_irq_type = { 380static struct irq_chip SIGVTALRM_irq_type = {
381 .typename = "SIGVTALRM", 381 .name = "SIGVTALRM",
382 .release = free_irq_by_irq_and_dev, 382 .release = free_irq_by_irq_and_dev,
383 .shutdown = dummy, /* never called */ 383 .shutdown = dummy, /* never called */
384 .disable = dummy, 384 .disable = dummy,
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fd227d6b8d9c..7ab9db88ab6a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -63,6 +63,10 @@ config X86
63 select HAVE_USER_RETURN_NOTIFIER 63 select HAVE_USER_RETURN_NOTIFIER
64 select HAVE_ARCH_JUMP_LABEL 64 select HAVE_ARCH_JUMP_LABEL
65 select HAVE_TEXT_POKE_SMP 65 select HAVE_TEXT_POKE_SMP
66 select HAVE_GENERIC_HARDIRQS
67 select HAVE_SPARSE_IRQ
68 select GENERIC_IRQ_PROBE
69 select GENERIC_PENDING_IRQ if SMP
66 70
67config INSTRUCTION_DECODER 71config INSTRUCTION_DECODER
68 def_bool (KPROBES || PERF_EVENTS) 72 def_bool (KPROBES || PERF_EVENTS)
@@ -204,20 +208,6 @@ config HAVE_INTEL_TXT
204 def_bool y 208 def_bool y
205 depends on EXPERIMENTAL && DMAR && ACPI 209 depends on EXPERIMENTAL && DMAR && ACPI
206 210
207# Use the generic interrupt handling code in kernel/irq/:
208config GENERIC_HARDIRQS
209 def_bool y
210
211config GENERIC_HARDIRQS_NO__DO_IRQ
212 def_bool y
213
214config GENERIC_IRQ_PROBE
215 def_bool y
216
217config GENERIC_PENDING_IRQ
218 def_bool y
219 depends on GENERIC_HARDIRQS && SMP
220
221config USE_GENERIC_SMP_HELPERS 211config USE_GENERIC_SMP_HELPERS
222 def_bool y 212 def_bool y
223 depends on SMP 213 depends on SMP
@@ -300,23 +290,6 @@ config X86_X2APIC
300 290
301 If you don't know what to do here, say N. 291 If you don't know what to do here, say N.
302 292
303config SPARSE_IRQ
304 bool "Support sparse irq numbering"
305 depends on PCI_MSI || HT_IRQ
306 ---help---
307 This enables support for sparse irqs. This is useful for distro
308 kernels that want to define a high CONFIG_NR_CPUS value but still
309 want to have low kernel memory footprint on smaller machines.
310
311 ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread
312 out the irq_desc[] array in a more NUMA-friendly way. )
313
314 If you don't know what to do here, say N.
315
316config NUMA_IRQ_DESC
317 def_bool y
318 depends on SPARSE_IRQ && NUMA
319
320config X86_MPPARSE 293config X86_MPPARSE
321 bool "Enable MPS table" if ACPI 294 bool "Enable MPS table" if ACPI
322 default y 295 default y
@@ -521,25 +494,6 @@ if PARAVIRT_GUEST
521 494
522source "arch/x86/xen/Kconfig" 495source "arch/x86/xen/Kconfig"
523 496
524config VMI
525 bool "VMI Guest support (DEPRECATED)"
526 select PARAVIRT
527 depends on X86_32
528 ---help---
529 VMI provides a paravirtualized interface to the VMware ESX server
530 (it could be used by other hypervisors in theory too, but is not
531 at the moment), by linking the kernel to a GPL-ed ROM module
532 provided by the hypervisor.
533
534 As of September 2009, VMware has started a phased retirement
535 of this feature from VMware's products. Please see
536 feature-removal-schedule.txt for details. If you are
537 planning to enable this option, please note that you cannot
538 live migrate a VMI enabled VM to a future VMware product,
539 which doesn't support VMI. So if you expect your kernel to
540 seamlessly migrate to newer VMware products, keep this
541 disabled.
542
543config KVM_CLOCK 497config KVM_CLOCK
544 bool "KVM paravirtualized clock" 498 bool "KVM paravirtualized clock"
545 select PARAVIRT 499 select PARAVIRT
@@ -674,7 +628,7 @@ config GART_IOMMU
674 bool "GART IOMMU support" if EMBEDDED 628 bool "GART IOMMU support" if EMBEDDED
675 default y 629 default y
676 select SWIOTLB 630 select SWIOTLB
677 depends on X86_64 && PCI && K8_NB 631 depends on X86_64 && PCI && AMD_NB
678 ---help--- 632 ---help---
679 Support for full DMA access of devices with 32bit memory access only 633 Support for full DMA access of devices with 32bit memory access only
680 on systems with more than 3GB. This is usually needed for USB, 634 on systems with more than 3GB. This is usually needed for USB,
@@ -799,6 +753,17 @@ config SCHED_MC
799 making when dealing with multi-core CPU chips at a cost of slightly 753 making when dealing with multi-core CPU chips at a cost of slightly
800 increased overhead in some places. If unsure say N here. 754 increased overhead in some places. If unsure say N here.
801 755
756config IRQ_TIME_ACCOUNTING
757 bool "Fine granularity task level IRQ time accounting"
758 default n
759 ---help---
760 Select this option to enable fine granularity task irq time
761 accounting. This is done by reading a timestamp on each
762 transition between softirq and hardirq state, so there can be a
763 small performance impact.
764
765 If in doubt, say N here.
766
802source "kernel/Kconfig.preempt" 767source "kernel/Kconfig.preempt"
803 768
804config X86_UP_APIC 769config X86_UP_APIC
@@ -1152,6 +1117,9 @@ config X86_PAE
1152config ARCH_PHYS_ADDR_T_64BIT 1117config ARCH_PHYS_ADDR_T_64BIT
1153 def_bool X86_64 || X86_PAE 1118 def_bool X86_64 || X86_PAE
1154 1119
1120config ARCH_DMA_ADDR_T_64BIT
1121 def_bool X86_64 || HIGHMEM64G
1122
1155config DIRECT_GBPAGES 1123config DIRECT_GBPAGES
1156 bool "Enable 1GB pages for kernel pagetables" if EMBEDDED 1124 bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
1157 default y 1125 default y
@@ -1330,25 +1298,34 @@ config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
1330 Set whether the default state of memory_corruption_check is 1298 Set whether the default state of memory_corruption_check is
1331 on or off. 1299 on or off.
1332 1300
1333config X86_RESERVE_LOW_64K 1301config X86_RESERVE_LOW
1334 bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen" 1302 int "Amount of low memory, in kilobytes, to reserve for the BIOS"
1335 default y 1303 default 64
1304 range 4 640
1336 ---help--- 1305 ---help---
1337 Reserve the first 64K of physical RAM on BIOSes that are known 1306 Specify the amount of low memory to reserve for the BIOS.
1338 to potentially corrupt that memory range. A numbers of BIOSes are 1307
1339 known to utilize this area during suspend/resume, so it must not 1308 The first page contains BIOS data structures that the kernel
1340 be used by the kernel. 1309 must not use, so that page must always be reserved.
1341 1310
1342 Set this to N if you are absolutely sure that you trust the BIOS 1311 By default we reserve the first 64K of physical RAM, as a
1343 to get all its memory reservations and usages right. 1312 number of BIOSes are known to corrupt that memory range
1313 during events such as suspend/resume or monitor cable
1314 insertion, so it must not be used by the kernel.
1344 1315
1345 If you have doubts about the BIOS (e.g. suspend/resume does not 1316 You can set this to 4 if you are absolutely sure that you
1346 work or there's kernel crashes after certain hardware hotplug 1317 trust the BIOS to get all its memory reservations and usages
1347 events) and it's not AMI or Phoenix, then you might want to enable 1318 right. If you know your BIOS have problems beyond the
1348 X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical 1319 default 64K area, you can set this to 640 to avoid using the
1349 corruption patterns. 1320 entire low memory range.
1350 1321
1351 Say Y if unsure. 1322 If you have doubts about the BIOS (e.g. suspend/resume does
1323 not work or there's kernel crashes after certain hardware
1324 hotplug events) then you might want to enable
1325 X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check
1326 typical corruption patterns.
1327
1328 Leave this to the default value of 64 if you are unsure.
1352 1329
1353config MATH_EMULATION 1330config MATH_EMULATION
1354 bool 1331 bool
@@ -1904,7 +1881,7 @@ config PCI_GODIRECT
1904 bool "Direct" 1881 bool "Direct"
1905 1882
1906config PCI_GOOLPC 1883config PCI_GOOLPC
1907 bool "OLPC" 1884 bool "OLPC XO-1"
1908 depends on OLPC 1885 depends on OLPC
1909 1886
1910config PCI_GOANY 1887config PCI_GOANY
@@ -2065,14 +2042,21 @@ config SCx200HR_TIMER
2065config OLPC 2042config OLPC
2066 bool "One Laptop Per Child support" 2043 bool "One Laptop Per Child support"
2067 select GPIOLIB 2044 select GPIOLIB
2045 select OLPC_OPENFIRMWARE
2068 ---help--- 2046 ---help---
2069 Add support for detecting the unique features of the OLPC 2047 Add support for detecting the unique features of the OLPC
2070 XO hardware. 2048 XO hardware.
2071 2049
2050config OLPC_XO1
2051 tristate "OLPC XO-1 support"
2052 depends on OLPC && PCI
2053 ---help---
2054 Add support for non-essential features of the OLPC XO-1 laptop.
2055
2072config OLPC_OPENFIRMWARE 2056config OLPC_OPENFIRMWARE
2073 bool "Support for OLPC's Open Firmware" 2057 bool "Support for OLPC's Open Firmware"
2074 depends on !X86_64 && !X86_PAE 2058 depends on !X86_64 && !X86_PAE
2075 default y if OLPC 2059 default n
2076 help 2060 help
2077 This option adds support for the implementation of Open Firmware 2061 This option adds support for the implementation of Open Firmware
2078 that is used on the OLPC XO-1 Children's Machine. 2062 that is used on the OLPC XO-1 Children's Machine.
@@ -2080,7 +2064,7 @@ config OLPC_OPENFIRMWARE
2080 2064
2081endif # X86_32 2065endif # X86_32
2082 2066
2083config K8_NB 2067config AMD_NB
2084 def_bool y 2068 def_bool y
2085 depends on CPU_SUP_AMD && PCI 2069 depends on CPU_SUP_AMD && PCI
2086 2070
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 75085080b63e..e5bb96b10f1a 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -43,6 +43,10 @@ config EARLY_PRINTK
43 with klogd/syslogd or the X server. You should normally say N here, 43 with klogd/syslogd or the X server. You should normally say N here,
44 unless you want to debug such a crash. 44 unless you want to debug such a crash.
45 45
46config EARLY_PRINTK_MRST
47 bool "Early printk for MRST platform support"
48 depends on EARLY_PRINTK && X86_MRST
49
46config EARLY_PRINTK_DBGP 50config EARLY_PRINTK_DBGP
47 bool "Early printk via EHCI debug port" 51 bool "Early printk via EHCI debug port"
48 depends on EARLY_PRINTK && PCI 52 depends on EARLY_PRINTK && PCI
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index e8c8881351b3..b02e509072a7 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -96,8 +96,12 @@ cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_en
96# is .cfi_signal_frame supported too? 96# is .cfi_signal_frame supported too?
97cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1) 97cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1)
98cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1) 98cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1)
99KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) 99
100KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) 100# does binutils support specific instructions?
101asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
102
103KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
104KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
101 105
102LDFLAGS := -m elf_$(UTS_MACHINE) 106LDFLAGS := -m elf_$(UTS_MACHINE)
103 107
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index 5af2982133b5..f16a2caca1e0 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h
index cb030374b90a..916bc8111a01 100644
--- a/arch/x86/include/asm/amd_iommu_proto.h
+++ b/arch/x86/include/asm/amd_iommu_proto.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2009 Advanced Micro Devices, Inc. 2 * Copyright (C) 2009-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify it 5 * This program is free software; you can redistribute it and/or modify it
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 08616180deaf..e3509fc303bf 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -416,13 +416,22 @@ struct amd_iommu {
416 struct dma_ops_domain *default_dom; 416 struct dma_ops_domain *default_dom;
417 417
418 /* 418 /*
419 * This array is required to work around a potential BIOS bug. 419 * We can't rely on the BIOS to restore all values on reinit, so we
420 * The BIOS may miss to restore parts of the PCI configuration 420 * need to stash them
421 * space when the system resumes from S3. The result is that the
422 * IOMMU does not execute commands anymore which leads to system
423 * failure.
424 */ 421 */
425 u32 cache_cfg[4]; 422
423 /* The iommu BAR */
424 u32 stored_addr_lo;
425 u32 stored_addr_hi;
426
427 /*
428 * Each iommu has 6 l1s, each of which is documented as having 0x12
429 * registers
430 */
431 u32 stored_l1[6][0x12];
432
433 /* The l2 indirect registers */
434 u32 stored_l2[0x83];
426}; 435};
427 436
428/* 437/*
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/amd_nb.h
index af00bd1d2089..c8517f81b21e 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -1,5 +1,5 @@
1#ifndef _ASM_X86_K8_H 1#ifndef _ASM_X86_AMD_NB_H
2#define _ASM_X86_K8_H 2#define _ASM_X86_AMD_NB_H
3 3
4#include <linux/pci.h> 4#include <linux/pci.h>
5 5
@@ -7,24 +7,27 @@ extern struct pci_device_id k8_nb_ids[];
7struct bootnode; 7struct bootnode;
8 8
9extern int early_is_k8_nb(u32 value); 9extern int early_is_k8_nb(u32 value);
10extern struct pci_dev **k8_northbridges;
11extern int num_k8_northbridges;
12extern int cache_k8_northbridges(void); 10extern int cache_k8_northbridges(void);
13extern void k8_flush_garts(void); 11extern void k8_flush_garts(void);
14extern int k8_get_nodes(struct bootnode *nodes); 12extern int k8_get_nodes(struct bootnode *nodes);
15extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn); 13extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
16extern int k8_scan_nodes(void); 14extern int k8_scan_nodes(void);
17 15
18#ifdef CONFIG_K8_NB 16struct k8_northbridge_info {
19extern int num_k8_northbridges; 17 u16 num;
18 u8 gart_supported;
19 struct pci_dev **nb_misc;
20};
21extern struct k8_northbridge_info k8_northbridges;
22
23#ifdef CONFIG_AMD_NB
20 24
21static inline struct pci_dev *node_to_k8_nb_misc(int node) 25static inline struct pci_dev *node_to_k8_nb_misc(int node)
22{ 26{
23 return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL; 27 return (node < k8_northbridges.num) ? k8_northbridges.nb_misc[node] : NULL;
24} 28}
25 29
26#else 30#else
27#define num_k8_northbridges 0
28 31
29static inline struct pci_dev *node_to_k8_nb_misc(int node) 32static inline struct pci_dev *node_to_k8_nb_misc(int node)
30{ 33{
@@ -33,4 +36,4 @@ static inline struct pci_dev *node_to_k8_nb_misc(int node)
33#endif 36#endif
34 37
35 38
36#endif /* _ASM_X86_K8_H */ 39#endif /* _ASM_X86_AMD_NB_H */
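Illustrative only (the function name is made up): with the count, GART capability flag, and misc-device array folded into one structure, callers that previously used num_k8_northbridges and the bare array would now iterate roughly like this, after cache_k8_northbridges() has populated the structure.

static void example_walk_northbridges(void)
{
	int i;
	u32 ctl;

	if (!k8_northbridges.gart_supported)
		return;

	for (i = 0; i < k8_northbridges.num; i++) {
		struct pci_dev *dev = k8_northbridges.nb_misc[i];

		/* 0x9c is AMD64_GARTCACHECTL, as defined in asm/gart.h */
		pci_read_config_dword(dev, 0x9c, &ctl);
		pr_info("northbridge %d: GART cache control 0x%08x\n", i, ctl);
	}
}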
diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h
index a69b1ac9eaf8..2fefa501d3ba 100644
--- a/arch/x86/include/asm/apb_timer.h
+++ b/arch/x86/include/asm/apb_timer.h
@@ -54,7 +54,6 @@ extern struct clock_event_device *global_clock_event;
54extern unsigned long apbt_quick_calibrate(void); 54extern unsigned long apbt_quick_calibrate(void);
55extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu); 55extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu);
56extern void apbt_setup_secondary_clock(void); 56extern void apbt_setup_secondary_clock(void);
57extern unsigned int boot_cpu_id;
58 57
59extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint); 58extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint);
60extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr); 59extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr);
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 1fa03e04ae44..286de34b0ed6 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -252,9 +252,7 @@ static inline int apic_is_clustered_box(void)
252} 252}
253#endif 253#endif
254 254
255extern u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask); 255extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask);
256extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask);
257
258 256
259#else /* !CONFIG_X86_LOCAL_APIC */ 257#else /* !CONFIG_X86_LOCAL_APIC */
260static inline void lapic_shutdown(void) { } 258static inline void lapic_shutdown(void) { }
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 7fe3b3060f08..a859ca461fb0 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -131,6 +131,7 @@
131#define APIC_EILVTn(n) (0x500 + 0x10 * n) 131#define APIC_EILVTn(n) (0x500 + 0x10 * n)
132#define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */ 132#define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */
133#define APIC_EILVT_NR_AMD_10H 4 133#define APIC_EILVT_NR_AMD_10H 4
134#define APIC_EILVT_NR_MAX APIC_EILVT_NR_AMD_10H
134#define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF) 135#define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF)
135#define APIC_EILVT_MSG_FIX 0x0 136#define APIC_EILVT_MSG_FIX 0x0
136#define APIC_EILVT_MSG_SMI 0x2 137#define APIC_EILVT_MSG_SMI 0x2
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index b185091bf19c..4fab24de26b1 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -32,6 +32,5 @@ extern void arch_unregister_cpu(int);
32 32
33DECLARE_PER_CPU(int, cpu_state); 33DECLARE_PER_CPU(int, cpu_state);
34 34
35extern unsigned int boot_cpu_id;
36 35
37#endif /* _ASM_X86_CPU_H */ 36#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 3f76523589af..220e2ea08e80 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -152,10 +152,14 @@
152#define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */ 152#define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */
153#define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */ 153#define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */
154#define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */ 154#define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */
155#define X86_FEATURE_SSE5 (6*32+11) /* SSE-5 */ 155#define X86_FEATURE_XOP (6*32+11) /* extended AVX instructions */
156#define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */ 156#define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */
157#define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */ 157#define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */
158#define X86_FEATURE_LWP (6*32+15) /* Light Weight Profiling */
159#define X86_FEATURE_FMA4 (6*32+16) /* 4 operands MAC instructions */
158#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ 160#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */
161#define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */
162#define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */
159 163
160/* 164/*
161 * Auxiliary flags: Linux defined - For features scattered in various 165 * Auxiliary flags: Linux defined - For features scattered in various
@@ -180,6 +184,13 @@
180#define X86_FEATURE_LBRV (8*32+ 6) /* AMD LBR Virtualization support */ 184#define X86_FEATURE_LBRV (8*32+ 6) /* AMD LBR Virtualization support */
181#define X86_FEATURE_SVML (8*32+ 7) /* "svm_lock" AMD SVM locking MSR */ 185#define X86_FEATURE_SVML (8*32+ 7) /* "svm_lock" AMD SVM locking MSR */
182#define X86_FEATURE_NRIPS (8*32+ 8) /* "nrip_save" AMD SVM next_rip save */ 186#define X86_FEATURE_NRIPS (8*32+ 8) /* "nrip_save" AMD SVM next_rip save */
187#define X86_FEATURE_TSCRATEMSR (8*32+ 9) /* "tsc_scale" AMD TSC scaling support */
188#define X86_FEATURE_VMCBCLEAN (8*32+10) /* "vmcb_clean" AMD VMCB clean bits support */
189#define X86_FEATURE_FLUSHBYASID (8*32+11) /* AMD flush-by-ASID support */
190#define X86_FEATURE_DECODEASSISTS (8*32+12) /* AMD Decode Assists support */
191#define X86_FEATURE_PAUSEFILTER (8*32+13) /* AMD filtered pause intercept */
192#define X86_FEATURE_PFTHRESHOLD (8*32+14) /* AMD pause filter threshold */
193
183 194
184/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ 195/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
185#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ 196#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
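The new bits are consumed through the existing feature-test helpers; a small sketch (example_report_amd_features() is hypothetical):

static void example_report_amd_features(struct cpuinfo_x86 *c)
{
	if (cpu_has(c, X86_FEATURE_XOP))
		pr_info("XOP extended AVX instructions supported\n");
	if (cpu_has(c, X86_FEATURE_FMA4))
		pr_info("FMA4 4-operand MAC instructions supported\n");
	if (boot_cpu_has(X86_FEATURE_TOPOEXT))
		pr_info("topology extension CPUID leafs present\n");
}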
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index 733f7e91e7a9..326099199318 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -89,6 +89,16 @@
89 CFI_ADJUST_CFA_OFFSET -8 89 CFI_ADJUST_CFA_OFFSET -8
90 .endm 90 .endm
91 91
92 .macro pushfq_cfi
93 pushfq
94 CFI_ADJUST_CFA_OFFSET 8
95 .endm
96
97 .macro popfq_cfi
98 popfq
99 CFI_ADJUST_CFA_OFFSET -8
100 .endm
101
92 .macro movq_cfi reg offset=0 102 .macro movq_cfi reg offset=0
93 movq %\reg, \offset(%rsp) 103 movq %\reg, \offset(%rsp)
94 CFI_REL_OFFSET \reg, \offset 104 CFI_REL_OFFSET \reg, \offset
@@ -109,6 +119,16 @@
109 CFI_ADJUST_CFA_OFFSET -4 119 CFI_ADJUST_CFA_OFFSET -4
110 .endm 120 .endm
111 121
122 .macro pushfl_cfi
123 pushfl
124 CFI_ADJUST_CFA_OFFSET 4
125 .endm
126
127 .macro popfl_cfi
128 popfl
129 CFI_ADJUST_CFA_OFFSET -4
130 .endm
131
112 .macro movl_cfi reg offset=0 132 .macro movl_cfi reg offset=0
113 movl %\reg, \offset(%esp) 133 movl %\reg, \offset(%esp)
114 CFI_REL_OFFSET \reg, \offset 134 CFI_REL_OFFSET \reg, \offset
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index d07b44f7d1dc..4d293dced62f 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -214,5 +214,20 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr)
214 BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); 214 BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
215 return __virt_to_fix(vaddr); 215 return __virt_to_fix(vaddr);
216} 216}
217
218/* Return a pointer with the offset calculated */
219static inline unsigned long __set_fixmap_offset(enum fixed_addresses idx,
220 phys_addr_t phys, pgprot_t flags)
221{
222 __set_fixmap(idx, phys, flags);
223 return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1));
224}
225
226#define set_fixmap_offset(idx, phys) \
227 __set_fixmap_offset(idx, phys, PAGE_KERNEL)
228
229#define set_fixmap_offset_nocache(idx, phys) \
230 __set_fixmap_offset(idx, phys, PAGE_KERNEL_NOCACHE)
231
217#endif /* !__ASSEMBLY__ */ 232#endif /* !__ASSEMBLY__ */
218#endif /* _ASM_X86_FIXMAP_H */ 233#endif /* _ASM_X86_FIXMAP_H */
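A usage sketch for the new helper (FIX_EXAMPLE_MMIO is a hypothetical fixed_addresses slot, not one defined by this patch): the returned virtual address already carries the sub-page offset of the physical address, which is what callers mapping non page aligned register blocks want.

static void __iomem *example_map_mmio(phys_addr_t phys)
{
	unsigned long vaddr;

	/* Map uncached and keep the offset within the page. */
	vaddr = set_fixmap_offset_nocache(FIX_EXAMPLE_MMIO, phys);
	return (void __iomem *)vaddr;
}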
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 4ac5b0f33fc1..bf357f9b25f0 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -17,6 +17,7 @@ extern int fix_aperture;
17#define GARTEN (1<<0) 17#define GARTEN (1<<0)
18#define DISGARTCPU (1<<4) 18#define DISGARTCPU (1<<4)
19#define DISGARTIO (1<<5) 19#define DISGARTIO (1<<5)
20#define DISTLBWALKPRB (1<<6)
20 21
21/* GART cache control register bits. */ 22/* GART cache control register bits. */
22#define INVGART (1<<0) 23#define INVGART (1<<0)
@@ -27,7 +28,6 @@ extern int fix_aperture;
27#define AMD64_GARTAPERTUREBASE 0x94 28#define AMD64_GARTAPERTUREBASE 0x94
28#define AMD64_GARTTABLEBASE 0x98 29#define AMD64_GARTTABLEBASE 0x98
29#define AMD64_GARTCACHECTL 0x9c 30#define AMD64_GARTCACHECTL 0x9c
30#define AMD64_GARTEN (1<<0)
31 31
32#ifdef CONFIG_GART_IOMMU 32#ifdef CONFIG_GART_IOMMU
33extern int gart_iommu_aperture; 33extern int gart_iommu_aperture;
@@ -57,6 +57,19 @@ static inline void gart_iommu_hole_init(void)
57 57
58extern int agp_amd64_init(void); 58extern int agp_amd64_init(void);
59 59
60static inline void gart_set_size_and_enable(struct pci_dev *dev, u32 order)
61{
62 u32 ctl;
63
64 /*
65 * Don't enable translation but enable GART IO and CPU accesses.
66 * Also, set DISTLBWALKPRB since GART tables memory is UC.
67 */
68 ctl = DISTLBWALKPRB | order << 1;
69
70 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
71}
72
60static inline void enable_gart_translation(struct pci_dev *dev, u64 addr) 73static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
61{ 74{
62 u32 tmp, ctl; 75 u32 tmp, ctl;
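A sketch of how the new helper pairs with the existing one during aperture setup (example_setup_gart() is hypothetical; the ordering is the point): size the aperture with translation still off, then program the table base and enable translation.

static void example_setup_gart(struct pci_dev *nb, u32 order, u64 table)
{
	/* Sets DISTLBWALKPRB and the aperture size, leaves GARTEN clear. */
	gart_set_size_and_enable(nb, order);

	/* Programs the GART table base and finally sets GARTEN. */
	enable_gart_translation(nb, table);
}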
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 1d5c08a1bdfd..2c392d663dce 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -74,10 +74,12 @@ extern void hpet_disable(void);
74extern unsigned int hpet_readl(unsigned int a); 74extern unsigned int hpet_readl(unsigned int a);
75extern void force_hpet_resume(void); 75extern void force_hpet_resume(void);
76 76
77extern void hpet_msi_unmask(unsigned int irq); 77struct irq_data;
78extern void hpet_msi_mask(unsigned int irq); 78extern void hpet_msi_unmask(struct irq_data *data);
79extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg); 79extern void hpet_msi_mask(struct irq_data *data);
80extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg); 80struct hpet_dev;
81extern void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg);
82extern void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg);
81 83
82#ifdef CONFIG_PCI_MSI 84#ifdef CONFIG_PCI_MSI
83extern int arch_setup_hpet_msi(unsigned int irq, unsigned int id); 85extern int arch_setup_hpet_msi(unsigned int irq, unsigned int id);
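Since hpet_msi_mask()/hpet_msi_unmask() now match the irq_data based callback prototypes, they can be wired straight into an irq_chip; a minimal sketch, not the kernel's actual HPET MSI chip definition:

static struct irq_chip example_hpet_msi_chip = {
	.name		= "HPET-MSI",
	.irq_mask	= hpet_msi_mask,
	.irq_unmask	= hpet_msi_unmask,
};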
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 3a54a1ca1a02..0274ec5a7e62 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -78,6 +78,13 @@ static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
78 irq_attr->polarity = polarity; 78 irq_attr->polarity = polarity;
79} 79}
80 80
81struct irq_2_iommu {
82 struct intel_iommu *iommu;
83 u16 irte_index;
84 u16 sub_handle;
85 u8 irte_mask;
86};
87
81/* 88/*
82 * This is performance-critical, we want to do it O(1) 89 * This is performance-critical, we want to do it O(1)
83 * 90 *
@@ -89,15 +96,17 @@ struct irq_cfg {
89 cpumask_var_t old_domain; 96 cpumask_var_t old_domain;
90 u8 vector; 97 u8 vector;
91 u8 move_in_progress : 1; 98 u8 move_in_progress : 1;
99#ifdef CONFIG_INTR_REMAP
100 struct irq_2_iommu irq_2_iommu;
101#endif
92}; 102};
93 103
94extern struct irq_cfg *irq_cfg(unsigned int);
95extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *); 104extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *);
96extern void send_cleanup_vector(struct irq_cfg *); 105extern void send_cleanup_vector(struct irq_cfg *);
97 106
98struct irq_desc; 107struct irq_data;
99extern unsigned int set_desc_affinity(struct irq_desc *, const struct cpumask *, 108int __ioapic_set_affinity(struct irq_data *, const struct cpumask *,
100 unsigned int *dest_id); 109 unsigned int *dest_id);
101extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, struct io_apic_irq_attr *irq_attr); 110extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, struct io_apic_irq_attr *irq_attr);
102extern void setup_ioapic_dest(void); 111extern void setup_ioapic_dest(void);
103 112
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index a73a8d5a5e69..4aa2bb3b242a 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -55,6 +55,12 @@ extern int save_i387_xstate_ia32(void __user *buf);
55extern int restore_i387_xstate_ia32(void __user *buf); 55extern int restore_i387_xstate_ia32(void __user *buf);
56#endif 56#endif
57 57
58#ifdef CONFIG_MATH_EMULATION
59extern void finit_soft_fpu(struct i387_soft_struct *soft);
60#else
61static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
62#endif
63
58#define X87_FSW_ES (1 << 7) /* Exception Summary */ 64#define X87_FSW_ES (1 << 7) /* Exception Summary */
59 65
60static __always_inline __pure bool use_xsaveopt(void) 66static __always_inline __pure bool use_xsaveopt(void)
@@ -67,6 +73,11 @@ static __always_inline __pure bool use_xsave(void)
67 return static_cpu_has(X86_FEATURE_XSAVE); 73 return static_cpu_has(X86_FEATURE_XSAVE);
68} 74}
69 75
76static __always_inline __pure bool use_fxsr(void)
77{
78 return static_cpu_has(X86_FEATURE_FXSR);
79}
80
70extern void __sanitize_i387_state(struct task_struct *); 81extern void __sanitize_i387_state(struct task_struct *);
71 82
72static inline void sanitize_i387_state(struct task_struct *tsk) 83static inline void sanitize_i387_state(struct task_struct *tsk)
@@ -77,19 +88,11 @@ static inline void sanitize_i387_state(struct task_struct *tsk)
77} 88}
78 89
79#ifdef CONFIG_X86_64 90#ifdef CONFIG_X86_64
80
81/* Ignore delayed exceptions from user space */
82static inline void tolerant_fwait(void)
83{
84 asm volatile("1: fwait\n"
85 "2:\n"
86 _ASM_EXTABLE(1b, 2b));
87}
88
89static inline int fxrstor_checking(struct i387_fxsave_struct *fx) 91static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
90{ 92{
91 int err; 93 int err;
92 94
95 /* See comment in fxsave() below. */
93 asm volatile("1: rex64/fxrstor (%[fx])\n\t" 96 asm volatile("1: rex64/fxrstor (%[fx])\n\t"
94 "2:\n" 97 "2:\n"
95 ".section .fixup,\"ax\"\n" 98 ".section .fixup,\"ax\"\n"
@@ -98,44 +101,10 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
98 ".previous\n" 101 ".previous\n"
99 _ASM_EXTABLE(1b, 3b) 102 _ASM_EXTABLE(1b, 3b)
100 : [err] "=r" (err) 103 : [err] "=r" (err)
101#if 0 /* See comment in fxsave() below. */ 104 : [fx] "R" (fx), "m" (*fx), "0" (0));
102 : [fx] "r" (fx), "m" (*fx), "0" (0));
103#else
104 : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
105#endif
106 return err; 105 return err;
107} 106}
108 107
109/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
110 is pending. Clear the x87 state here by setting it to fixed
111 values. The kernel data segment can be sometimes 0 and sometimes
112 new user value. Both should be ok.
113 Use the PDA as safe address because it should be already in L1. */
114static inline void fpu_clear(struct fpu *fpu)
115{
116 struct xsave_struct *xstate = &fpu->state->xsave;
117 struct i387_fxsave_struct *fx = &fpu->state->fxsave;
118
119 /*
120 * xsave header may indicate the init state of the FP.
121 */
122 if (use_xsave() &&
123 !(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
124 return;
125
126 if (unlikely(fx->swd & X87_FSW_ES))
127 asm volatile("fnclex");
128 alternative_input(ASM_NOP8 ASM_NOP2,
129 " emms\n" /* clear stack tags */
130 " fildl %%gs:0", /* load to clear state */
131 X86_FEATURE_FXSAVE_LEAK);
132}
133
134static inline void clear_fpu_state(struct task_struct *tsk)
135{
136 fpu_clear(&tsk->thread.fpu);
137}
138
139static inline int fxsave_user(struct i387_fxsave_struct __user *fx) 108static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
140{ 109{
141 int err; 110 int err;
@@ -149,6 +118,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
149 if (unlikely(err)) 118 if (unlikely(err))
150 return -EFAULT; 119 return -EFAULT;
151 120
121 /* See comment in fxsave() below. */
152 asm volatile("1: rex64/fxsave (%[fx])\n\t" 122 asm volatile("1: rex64/fxsave (%[fx])\n\t"
153 "2:\n" 123 "2:\n"
154 ".section .fixup,\"ax\"\n" 124 ".section .fixup,\"ax\"\n"
@@ -157,11 +127,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
157 ".previous\n" 127 ".previous\n"
158 _ASM_EXTABLE(1b, 3b) 128 _ASM_EXTABLE(1b, 3b)
159 : [err] "=r" (err), "=m" (*fx) 129 : [err] "=r" (err), "=m" (*fx)
160#if 0 /* See comment in fxsave() below. */ 130 : [fx] "R" (fx), "0" (0));
161 : [fx] "r" (fx), "0" (0));
162#else
163 : [fx] "cdaSDb" (fx), "0" (0));
164#endif
165 if (unlikely(err) && 131 if (unlikely(err) &&
166 __clear_user(fx, sizeof(struct i387_fxsave_struct))) 132 __clear_user(fx, sizeof(struct i387_fxsave_struct)))
167 err = -EFAULT; 133 err = -EFAULT;
@@ -175,56 +141,29 @@ static inline void fpu_fxsave(struct fpu *fpu)
175 uses any extended registers for addressing, a second REX prefix 141 uses any extended registers for addressing, a second REX prefix
176 will be generated (to the assembler, rex64 followed by semicolon 142 will be generated (to the assembler, rex64 followed by semicolon
177 is a separate instruction), and hence the 64-bitness is lost. */ 143 is a separate instruction), and hence the 64-bitness is lost. */
178#if 0 144
145#ifdef CONFIG_AS_FXSAVEQ
179 /* Using "fxsaveq %0" would be the ideal choice, but is only supported 146 /* Using "fxsaveq %0" would be the ideal choice, but is only supported
180 starting with gas 2.16. */ 147 starting with gas 2.16. */
181 __asm__ __volatile__("fxsaveq %0" 148 __asm__ __volatile__("fxsaveq %0"
182 : "=m" (fpu->state->fxsave)); 149 : "=m" (fpu->state->fxsave));
183#elif 0 150#else
184 /* Using, as a workaround, the properly prefixed form below isn't 151 /* Using, as a workaround, the properly prefixed form below isn't
185 accepted by any binutils version so far released, complaining that 152 accepted by any binutils version so far released, complaining that
186 the same type of prefix is used twice if an extended register is 153 the same type of prefix is used twice if an extended register is
187 needed for addressing (fix submitted to mainline 2005-11-21). */ 154 needed for addressing (fix submitted to mainline 2005-11-21).
188 __asm__ __volatile__("rex64/fxsave %0" 155 asm volatile("rex64/fxsave %0"
189 : "=m" (fpu->state->fxsave)); 156 : "=m" (fpu->state->fxsave));
190#else 157 This, however, we can work around by forcing the compiler to select
191 /* This, however, we can work around by forcing the compiler to select
192 an addressing mode that doesn't require extended registers. */ 158 an addressing mode that doesn't require extended registers. */
193 __asm__ __volatile__("rex64/fxsave (%1)" 159 asm volatile("rex64/fxsave (%[fx])"
194 : "=m" (fpu->state->fxsave) 160 : "=m" (fpu->state->fxsave)
195 : "cdaSDb" (&fpu->state->fxsave)); 161 : [fx] "R" (&fpu->state->fxsave));
196#endif 162#endif
197} 163}
198 164
199static inline void fpu_save_init(struct fpu *fpu)
200{
201 if (use_xsave())
202 fpu_xsave(fpu);
203 else
204 fpu_fxsave(fpu);
205
206 fpu_clear(fpu);
207}
208
209static inline void __save_init_fpu(struct task_struct *tsk)
210{
211 fpu_save_init(&tsk->thread.fpu);
212 task_thread_info(tsk)->status &= ~TS_USEDFPU;
213}
214
215#else /* CONFIG_X86_32 */ 165#else /* CONFIG_X86_32 */
216 166
217#ifdef CONFIG_MATH_EMULATION
218extern void finit_soft_fpu(struct i387_soft_struct *soft);
219#else
220static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
221#endif
222
223static inline void tolerant_fwait(void)
224{
225 asm volatile("fnclex ; fwait");
226}
227
228/* perform fxrstor iff the processor has extended states, otherwise frstor */ 167/* perform fxrstor iff the processor has extended states, otherwise frstor */
229static inline int fxrstor_checking(struct i387_fxsave_struct *fx) 168static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
230{ 169{
@@ -241,6 +180,14 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
241 return 0; 180 return 0;
242} 181}
243 182
183static inline void fpu_fxsave(struct fpu *fpu)
184{
185 asm volatile("fxsave %[fx]"
186 : [fx] "=m" (fpu->state->fxsave));
187}
188
189#endif /* CONFIG_X86_64 */
190
244/* We need a safe address that is cheap to find and that is already 191/* We need a safe address that is cheap to find and that is already
245 in L1 during context switch. The best choices are unfortunately 192 in L1 during context switch. The best choices are unfortunately
246 different for UP and SMP */ 193 different for UP and SMP */
@@ -256,47 +203,33 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
256static inline void fpu_save_init(struct fpu *fpu) 203static inline void fpu_save_init(struct fpu *fpu)
257{ 204{
258 if (use_xsave()) { 205 if (use_xsave()) {
259 struct xsave_struct *xstate = &fpu->state->xsave;
260 struct i387_fxsave_struct *fx = &fpu->state->fxsave;
261
262 fpu_xsave(fpu); 206 fpu_xsave(fpu);
263 207
264 /* 208 /*
265 * xsave header may indicate the init state of the FP. 209 * xsave header may indicate the init state of the FP.
266 */ 210 */
267 if (!(xstate->xsave_hdr.xstate_bv & XSTATE_FP)) 211 if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP))
268 goto end; 212 return;
269 213 } else if (use_fxsr()) {
270 if (unlikely(fx->swd & X87_FSW_ES)) 214 fpu_fxsave(fpu);
271 asm volatile("fnclex"); 215 } else {
272 216 asm volatile("fsave %[fx]; fwait"
273 /* 217 : [fx] "=m" (fpu->state->fsave));
274 * we can do a simple return here or be paranoid :) 218 return;
275 */
276 goto clear_state;
277 } 219 }
278 220
279 /* Use more nops than strictly needed in case the compiler 221 if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES))
280 varies code */ 222 asm volatile("fnclex");
281 alternative_input( 223
282 "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4,
283 "fxsave %[fx]\n"
284 "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:",
285 X86_FEATURE_FXSR,
286 [fx] "m" (fpu->state->fxsave),
287 [fsw] "m" (fpu->state->fxsave.swd) : "memory");
288clear_state:
289 /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception 224 /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
290 is pending. Clear the x87 state here by setting it to fixed 225 is pending. Clear the x87 state here by setting it to fixed
291 values. safe_address is a random variable that should be in L1 */ 226 values. safe_address is a random variable that should be in L1 */
292 alternative_input( 227 alternative_input(
293 GENERIC_NOP8 GENERIC_NOP2, 228 ASM_NOP8 ASM_NOP2,
294 "emms\n\t" /* clear stack tags */ 229 "emms\n\t" /* clear stack tags */
295 "fildl %[addr]", /* set F?P to defined value */ 230 "fildl %P[addr]", /* set F?P to defined value */
296 X86_FEATURE_FXSAVE_LEAK, 231 X86_FEATURE_FXSAVE_LEAK,
297 [addr] "m" (safe_address)); 232 [addr] "m" (safe_address));
298end:
299 ;
300} 233}
301 234
302static inline void __save_init_fpu(struct task_struct *tsk) 235static inline void __save_init_fpu(struct task_struct *tsk)
@@ -305,9 +238,6 @@ static inline void __save_init_fpu(struct task_struct *tsk)
305 task_thread_info(tsk)->status &= ~TS_USEDFPU; 238 task_thread_info(tsk)->status &= ~TS_USEDFPU;
306} 239}
307 240
308
309#endif /* CONFIG_X86_64 */
310
311static inline int fpu_fxrstor_checking(struct fpu *fpu) 241static inline int fpu_fxrstor_checking(struct fpu *fpu)
312{ 242{
313 return fxrstor_checking(&fpu->state->fxsave); 243 return fxrstor_checking(&fpu->state->fxsave);
@@ -344,7 +274,10 @@ static inline void __unlazy_fpu(struct task_struct *tsk)
344static inline void __clear_fpu(struct task_struct *tsk) 274static inline void __clear_fpu(struct task_struct *tsk)
345{ 275{
346 if (task_thread_info(tsk)->status & TS_USEDFPU) { 276 if (task_thread_info(tsk)->status & TS_USEDFPU) {
347 tolerant_fwait(); 277 /* Ignore delayed exceptions from user space */
278 asm volatile("1: fwait\n"
279 "2:\n"
280 _ASM_EXTABLE(1b, 2b));
348 task_thread_info(tsk)->status &= ~TS_USEDFPU; 281 task_thread_info(tsk)->status &= ~TS_USEDFPU;
349 stts(); 282 stts();
350 } 283 }
@@ -405,19 +338,6 @@ static inline void irq_ts_restore(int TS_state)
405 stts(); 338 stts();
406} 339}
407 340
408#ifdef CONFIG_X86_64
409
410static inline void save_init_fpu(struct task_struct *tsk)
411{
412 __save_init_fpu(tsk);
413 stts();
414}
415
416#define unlazy_fpu __unlazy_fpu
417#define clear_fpu __clear_fpu
418
419#else /* CONFIG_X86_32 */
420
421/* 341/*
422 * These disable preemption on their own and are safe 342 * These disable preemption on their own and are safe
423 */ 343 */
@@ -443,8 +363,6 @@ static inline void clear_fpu(struct task_struct *tsk)
443 preempt_enable(); 363 preempt_enable();
444} 364}
445 365
446#endif /* CONFIG_X86_64 */
447
448/* 366/*
449 * i387 state interaction 367 * i387 state interaction
450 */ 368 */
@@ -508,7 +426,4 @@ extern void fpu_finit(struct fpu *fpu);
508 426
509#endif /* __ASSEMBLY__ */ 427#endif /* __ASSEMBLY__ */
510 428
511#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
512#define PSHUFB_XMM5_XMM6 .byte 0x66, 0x0f, 0x38, 0x00, 0xf5
513
514#endif /* _ASM_X86_I387_H */ 429#endif /* _ASM_X86_I387_H */
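
One detail worth spelling out from the fpu_fxsave() hunk above: the "R" constraint limits the operand to a legacy (non-REX) integer register, so the hand-written rex64 prefix stays the only REX prefix the assembler emits. A minimal standalone sketch of the two paths selected by CONFIG_AS_FXSAVEQ (the function name is illustrative; assumes a 16-byte-aligned buffer):

	static inline void fxsave_to(struct i387_fxsave_struct *fx)
	{
	#ifdef CONFIG_AS_FXSAVEQ
		/* gas >= 2.16 accepts the explicit 64-bit mnemonic */
		asm volatile("fxsaveq %0" : "=m" (*fx));
	#else
		/* "R" keeps the address in one of the eight legacy registers,
		 * so no second (implicit) REX prefix is ever needed */
		asm volatile("rex64/fxsave (%[fx])"
			     : "=m" (*fx)
			     : [fx] "R" (fx));
	#endif
	}
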
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index 1655147646aa..a20365953bf8 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -55,6 +55,8 @@ extern struct irq_chip i8259A_chip;
55struct legacy_pic { 55struct legacy_pic {
56 int nr_legacy_irqs; 56 int nr_legacy_irqs;
57 struct irq_chip *chip; 57 struct irq_chip *chip;
58 void (*mask)(unsigned int irq);
59 void (*unmask)(unsigned int irq);
58 void (*mask_all)(void); 60 void (*mask_all)(void);
59 void (*restore_mask)(void); 61 void (*restore_mask)(void);
60 void (*init)(int auto_eoi); 62 void (*init)(int auto_eoi);
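
The new per-IRQ mask/unmask hooks let callers quiesce a single legacy interrupt through the legacy_pic abstraction instead of poking the 8259 directly. Hedged usage sketch (hypothetical caller, not from this patch):

	legacy_pic->mask(0);	/* silence IRQ0 while reprogramming the timer */
	/* ... reconfigure ... */
	legacy_pic->unmask(0);
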
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 30a3e9776123..6a45ec41ec26 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -206,6 +206,7 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
206 206
207extern void iounmap(volatile void __iomem *addr); 207extern void iounmap(volatile void __iomem *addr);
208 208
209extern void set_iounmap_nonlazy(void);
209 210
210#ifdef __KERNEL__ 211#ifdef __KERNEL__
211 212
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 9cb2edb87c2f..c8be4566c3d2 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -170,12 +170,6 @@ extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
170 170
171extern void probe_nr_irqs_gsi(void); 171extern void probe_nr_irqs_gsi(void);
172 172
173extern int setup_ioapic_entry(int apic, int irq,
174 struct IO_APIC_route_entry *entry,
175 unsigned int destination, int trigger,
176 int polarity, int vector, int pin);
177extern void ioapic_write_entry(int apic, int pin,
178 struct IO_APIC_route_entry e);
179extern void setup_ioapic_ids_from_mpc(void); 173extern void setup_ioapic_ids_from_mpc(void);
180 174
181struct mp_ioapic_gsi{ 175struct mp_ioapic_gsi{
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index f275e2244505..1c23360fb2d8 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -3,4 +3,39 @@
3 3
4#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8) 4#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8)
5 5
6#ifdef CONFIG_INTR_REMAP
7static inline void prepare_irte(struct irte *irte, int vector,
8 unsigned int dest)
9{
10 memset(irte, 0, sizeof(*irte));
11
12 irte->present = 1;
13 irte->dst_mode = apic->irq_dest_mode;
14 /*
15 * Trigger mode in the IRTE will always be edge, and for IO-APIC, the
16 * actual level or edge trigger will be setup in the IO-APIC
17 * RTE. This will help simplify level triggered irq migration.
 18 * For more details, see the comments (in io_apic.c) explaining IO-APIC
19 * irq migration in the presence of interrupt-remapping.
20 */
21 irte->trigger_mode = 0;
22 irte->dlvry_mode = apic->irq_delivery_mode;
23 irte->vector = vector;
24 irte->dest_id = IRTE_DEST(dest);
25 irte->redir_hint = 1;
26}
27static inline bool irq_remapped(struct irq_cfg *cfg)
28{
29 return cfg->irq_2_iommu.iommu != NULL;
30}
31#else
 32static inline void prepare_irte(struct irte *irte, int vector, unsigned int dest)
33{
34}
35static inline bool irq_remapped(struct irq_cfg *cfg)
36{
37 return false;
38}
39#endif
40
6#endif /* _ASM_X86_IRQ_REMAPPING_H */ 41#endif /* _ASM_X86_IRQ_REMAPPING_H */
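
A hedged sketch of how prepare_irte() is meant to be consumed on the interrupt-remapping path; modify_irte() is the existing intr_remapping helper, while irq, cfg and dest stand in for a caller's local state:

	struct irte irte;

	prepare_irte(&irte, cfg->vector, dest);	/* present, edge, fixed delivery */
	modify_irte(irq, &irte);		/* install into the remap table */
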
diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h
index 16350740edf6..4a711a684b17 100644
--- a/arch/x86/include/asm/mrst.h
+++ b/arch/x86/include/asm/mrst.h
@@ -10,6 +10,9 @@
10 */ 10 */
11#ifndef _ASM_X86_MRST_H 11#ifndef _ASM_X86_MRST_H
12#define _ASM_X86_MRST_H 12#define _ASM_X86_MRST_H
13
14#include <linux/sfi.h>
15
13extern int pci_mrst_init(void); 16extern int pci_mrst_init(void);
14int __init sfi_parse_mrtc(struct sfi_table_header *table); 17int __init sfi_parse_mrtc(struct sfi_table_header *table);
15 18
@@ -26,7 +29,7 @@ enum mrst_cpu_type {
26}; 29};
27 30
28extern enum mrst_cpu_type __mrst_cpu_chip; 31extern enum mrst_cpu_type __mrst_cpu_chip;
29static enum mrst_cpu_type mrst_identify_cpu(void) 32static inline enum mrst_cpu_type mrst_identify_cpu(void)
30{ 33{
31 return __mrst_cpu_chip; 34 return __mrst_cpu_chip;
32} 35}
@@ -42,4 +45,9 @@ extern enum mrst_timer_options mrst_timer_options;
42#define SFI_MTMR_MAX_NUM 8 45#define SFI_MTMR_MAX_NUM 8
43#define SFI_MRTC_MAX 8 46#define SFI_MRTC_MAX 8
44 47
48extern struct console early_mrst_console;
49extern void mrst_early_console_init(void);
50
51extern struct console early_hsu_console;
52extern void hsu_early_console_init(void);
45#endif /* _ASM_X86_MRST_H */ 53#endif /* _ASM_X86_MRST_H */
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
new file mode 100644
index 000000000000..bcdff997668c
--- /dev/null
+++ b/arch/x86/include/asm/mwait.h
@@ -0,0 +1,15 @@
1#ifndef _ASM_X86_MWAIT_H
2#define _ASM_X86_MWAIT_H
3
4#define MWAIT_SUBSTATE_MASK 0xf
5#define MWAIT_CSTATE_MASK 0xf
6#define MWAIT_SUBSTATE_SIZE 4
7#define MWAIT_MAX_NUM_CSTATES 8
8
9#define CPUID_MWAIT_LEAF 5
10#define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1
11#define CPUID5_ECX_INTERRUPT_BREAK 0x2
12
13#define MWAIT_ECX_INTERRUPT_BREAK 0x1
14
15#endif /* _ASM_X86_MWAIT_H */
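
These constants simply move out of acpi/cstate.c (see further down) so other users can share them. A sketch of the usual CPUID leaf-5 decode they support, mirroring the checks in cstate.c; the function name and the cstate parameter are illustrative:

	/* how many MWAIT sub-states does this C-state expose, assuming the
	 * break-on-interrupt extension is available? 0 means "not usable" */
	static int mwait_substates_for(unsigned int cstate)
	{
		unsigned int eax, ebx, ecx, edx;

		cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

		if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
		    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
			return 0;

		/* EDX packs a 4-bit sub-state count per C-state */
		return (edx >> (cstate * MWAIT_SUBSTATE_SIZE)) & MWAIT_SUBSTATE_MASK;
	}
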
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
index 08fde475cb3b..2a8478140bb3 100644
--- a/arch/x86/include/asm/olpc_ofw.h
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -21,10 +21,14 @@ extern void olpc_ofw_detect(void);
21/* install OFW's pde permanently into the kernel's pgtable */ 21/* install OFW's pde permanently into the kernel's pgtable */
22extern void setup_olpc_ofw_pgd(void); 22extern void setup_olpc_ofw_pgd(void);
23 23
24/* check if OFW was detected during boot */
25extern bool olpc_ofw_present(void);
26
24#else /* !CONFIG_OLPC_OPENFIRMWARE */ 27#else /* !CONFIG_OLPC_OPENFIRMWARE */
25 28
26static inline void olpc_ofw_detect(void) { } 29static inline void olpc_ofw_detect(void) { }
27static inline void setup_olpc_ofw_pgd(void) { } 30static inline void setup_olpc_ofw_pgd(void) { }
31static inline bool olpc_ofw_present(void) { return false; }
28 32
29#endif /* !CONFIG_OLPC_OPENFIRMWARE */ 33#endif /* !CONFIG_OLPC_OPENFIRMWARE */
30 34
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index a667f24c7254..1df66211fd1b 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -8,7 +8,7 @@
8#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) 8#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
9#define PAGE_MASK (~(PAGE_SIZE-1)) 9#define PAGE_MASK (~(PAGE_SIZE-1))
10 10
11#define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1) 11#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1))
12#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) 12#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
13 13
14/* Cast PAGE_MASK to a signed type so that it is sign-extended if 14/* Cast PAGE_MASK to a signed type so that it is sign-extended if
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 5653f43d90e5..edecb4ed2210 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -416,11 +416,6 @@ static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
416 PVOP_VCALL2(pv_mmu_ops.alloc_pmd, mm, pfn); 416 PVOP_VCALL2(pv_mmu_ops.alloc_pmd, mm, pfn);
417} 417}
418 418
419static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
420 unsigned long start, unsigned long count)
421{
422 PVOP_VCALL4(pv_mmu_ops.alloc_pmd_clone, pfn, clonepfn, start, count);
423}
424static inline void paravirt_release_pmd(unsigned long pfn) 419static inline void paravirt_release_pmd(unsigned long pfn)
425{ 420{
426 PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn); 421 PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index db9ef5532341..b82bac975250 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -255,7 +255,6 @@ struct pv_mmu_ops {
255 */ 255 */
256 void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn); 256 void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn);
257 void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn); 257 void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn);
258 void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count);
259 void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn); 258 void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn);
260 void (*release_pte)(unsigned long pfn); 259 void (*release_pte)(unsigned long pfn);
261 void (*release_pmd)(unsigned long pfn); 260 void (*release_pmd)(unsigned long pfn);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a34c785c5a63..ada823a13c7c 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -28,6 +28,8 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
28extern spinlock_t pgd_lock; 28extern spinlock_t pgd_lock;
29extern struct list_head pgd_list; 29extern struct list_head pgd_list;
30 30
31extern struct mm_struct *pgd_page_get_mm(struct page *page);
32
31#ifdef CONFIG_PARAVIRT 33#ifdef CONFIG_PARAVIRT
32#include <asm/paravirt.h> 34#include <asm/paravirt.h>
33#else /* !CONFIG_PARAVIRT */ 35#else /* !CONFIG_PARAVIRT */
@@ -603,6 +605,8 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
603 pte_update(mm, addr, ptep); 605 pte_update(mm, addr, ptep);
604} 606}
605 607
608#define flush_tlb_fix_spurious_fault(vma, address)
609
606/* 610/*
607 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); 611 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
608 * 612 *
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 076052cd62be..f96ac9bedf75 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -102,6 +102,8 @@ static inline void native_pgd_clear(pgd_t *pgd)
102 native_set_pgd(pgd, native_make_pgd(0)); 102 native_set_pgd(pgd, native_make_pgd(0));
103} 103}
104 104
105extern void sync_global_pgds(unsigned long start, unsigned long end);
106
105/* 107/*
106 * Conversion functions: convert a page and protection to a page entry, 108 * Conversion functions: convert a page and protection to a page entry,
107 * and a page entry and page directory to the page they refer to. 109 * and a page entry and page directory to the page they refer to.
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 325b7bdbebaa..cae9c3cb95cf 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -110,6 +110,8 @@ struct cpuinfo_x86 {
110 u16 phys_proc_id; 110 u16 phys_proc_id;
111 /* Core id: */ 111 /* Core id: */
112 u16 cpu_core_id; 112 u16 cpu_core_id;
113 /* Compute unit id */
114 u8 compute_unit_id;
113 /* Index into per_cpu list: */ 115 /* Index into per_cpu list: */
114 u16 cpu_index; 116 u16 cpu_index;
115#endif 117#endif
@@ -602,7 +604,7 @@ extern unsigned long mmu_cr4_features;
602 604
603static inline void set_in_cr4(unsigned long mask) 605static inline void set_in_cr4(unsigned long mask)
604{ 606{
605 unsigned cr4; 607 unsigned long cr4;
606 608
607 mmu_cr4_features |= mask; 609 mmu_cr4_features |= mask;
608 cr4 = read_cr4(); 610 cr4 = read_cr4();
@@ -612,7 +614,7 @@ static inline void set_in_cr4(unsigned long mask)
612 614
613static inline void clear_in_cr4(unsigned long mask) 615static inline void clear_in_cr4(unsigned long mask)
614{ 616{
615 unsigned cr4; 617 unsigned long cr4;
616 618
617 mmu_cr4_features &= ~mask; 619 mmu_cr4_features &= ~mask;
618 cr4 = read_cr4(); 620 cr4 = read_cr4();
@@ -764,29 +766,6 @@ extern unsigned long idle_halt;
764extern unsigned long idle_nomwait; 766extern unsigned long idle_nomwait;
765extern bool c1e_detected; 767extern bool c1e_detected;
766 768
767/*
768 * on systems with caches, caches must be flashed as the absolute
769 * last instruction before going into a suspended halt. Otherwise,
770 * dirty data can linger in the cache and become stale on resume,
771 * leading to strange errors.
772 *
773 * perform a variety of operations to guarantee that the compiler
774 * will not reorder instructions. wbinvd itself is serializing
775 * so the processor will not reorder.
776 *
777 * Systems without cache can just go into halt.
778 */
779static inline void wbinvd_halt(void)
780{
781 mb();
782 /* check for clflush to determine if wbinvd is legal */
783 if (cpu_has_clflush)
784 asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory");
785 else
786 while (1)
787 halt();
788}
789
790extern void enable_sep_cpu(void); 769extern void enable_sep_cpu(void);
791extern int sysenter_setup(void); 770extern int sysenter_setup(void);
792 771
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index ef292c792d74..d6763b139a84 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -93,6 +93,11 @@ void *extend_brk(size_t size, size_t align);
93 : : "i" (sz)); \ 93 : : "i" (sz)); \
94 } 94 }
95 95
96/* Helper for reserving space for arrays of things */
97#define RESERVE_BRK_ARRAY(type, name, entries) \
98 type *name; \
99 RESERVE_BRK(name, sizeof(type) * entries)
100
96#ifdef __i386__ 101#ifdef __i386__
97 102
98void __init i386_start_kernel(void); 103void __init i386_start_kernel(void);
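
A hedged usage sketch for the new RESERVE_BRK_ARRAY() helper above (the type, name and count are made up for illustration):

	/* declares "struct foo *foo_table" and grows the early brk reservation
	 * by sizeof(struct foo) * 16 bytes; the pointer is typically pointed at
	 * that space via extend_brk() during early boot */
	RESERVE_BRK_ARRAY(struct foo, foo_table, 16);
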
diff --git a/arch/x86/include/asm/vmi.h b/arch/x86/include/asm/vmi.h
deleted file mode 100644
index 61e08c0a2907..000000000000
--- a/arch/x86/include/asm/vmi.h
+++ /dev/null
@@ -1,269 +0,0 @@
1/*
2 * VMI interface definition
3 *
4 * Copyright (C) 2005, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Maintained by: Zachary Amsden zach@vmware.com
22 *
23 */
24#include <linux/types.h>
25
26/*
27 *---------------------------------------------------------------------
28 *
29 * VMI Option ROM API
30 *
31 *---------------------------------------------------------------------
32 */
33#define VMI_SIGNATURE 0x696d5663 /* "cVmi" */
34
35#define PCI_VENDOR_ID_VMWARE 0x15AD
36#define PCI_DEVICE_ID_VMWARE_VMI 0x0801
37
38/*
39 * We use two version numbers for compatibility, with the major
40 * number signifying interface breakages, and the minor number
41 * interface extensions.
42 */
43#define VMI_API_REV_MAJOR 3
44#define VMI_API_REV_MINOR 0
45
46#define VMI_CALL_CPUID 0
47#define VMI_CALL_WRMSR 1
48#define VMI_CALL_RDMSR 2
49#define VMI_CALL_SetGDT 3
50#define VMI_CALL_SetLDT 4
51#define VMI_CALL_SetIDT 5
52#define VMI_CALL_SetTR 6
53#define VMI_CALL_GetGDT 7
54#define VMI_CALL_GetLDT 8
55#define VMI_CALL_GetIDT 9
56#define VMI_CALL_GetTR 10
57#define VMI_CALL_WriteGDTEntry 11
58#define VMI_CALL_WriteLDTEntry 12
59#define VMI_CALL_WriteIDTEntry 13
60#define VMI_CALL_UpdateKernelStack 14
61#define VMI_CALL_SetCR0 15
62#define VMI_CALL_SetCR2 16
63#define VMI_CALL_SetCR3 17
64#define VMI_CALL_SetCR4 18
65#define VMI_CALL_GetCR0 19
66#define VMI_CALL_GetCR2 20
67#define VMI_CALL_GetCR3 21
68#define VMI_CALL_GetCR4 22
69#define VMI_CALL_WBINVD 23
70#define VMI_CALL_SetDR 24
71#define VMI_CALL_GetDR 25
72#define VMI_CALL_RDPMC 26
73#define VMI_CALL_RDTSC 27
74#define VMI_CALL_CLTS 28
75#define VMI_CALL_EnableInterrupts 29
76#define VMI_CALL_DisableInterrupts 30
77#define VMI_CALL_GetInterruptMask 31
78#define VMI_CALL_SetInterruptMask 32
79#define VMI_CALL_IRET 33
80#define VMI_CALL_SYSEXIT 34
81#define VMI_CALL_Halt 35
82#define VMI_CALL_Reboot 36
83#define VMI_CALL_Shutdown 37
84#define VMI_CALL_SetPxE 38
85#define VMI_CALL_SetPxELong 39
86#define VMI_CALL_UpdatePxE 40
87#define VMI_CALL_UpdatePxELong 41
88#define VMI_CALL_MachineToPhysical 42
89#define VMI_CALL_PhysicalToMachine 43
90#define VMI_CALL_AllocatePage 44
91#define VMI_CALL_ReleasePage 45
92#define VMI_CALL_InvalPage 46
93#define VMI_CALL_FlushTLB 47
94#define VMI_CALL_SetLinearMapping 48
95
96#define VMI_CALL_SetIOPLMask 61
97#define VMI_CALL_SetInitialAPState 62
98#define VMI_CALL_APICWrite 63
99#define VMI_CALL_APICRead 64
100#define VMI_CALL_IODelay 65
101#define VMI_CALL_SetLazyMode 73
102
103/*
104 *---------------------------------------------------------------------
105 *
106 * MMU operation flags
107 *
108 *---------------------------------------------------------------------
109 */
110
111/* Flags used by VMI_{Allocate|Release}Page call */
112#define VMI_PAGE_PAE 0x10 /* Allocate PAE shadow */
113#define VMI_PAGE_CLONE 0x20 /* Clone from another shadow */
114#define VMI_PAGE_ZEROED 0x40 /* Page is pre-zeroed */
115
116
117/* Flags shared by Allocate|Release Page and PTE updates */
118#define VMI_PAGE_PT 0x01
119#define VMI_PAGE_PD 0x02
120#define VMI_PAGE_PDP 0x04
121#define VMI_PAGE_PML4 0x08
122
123#define VMI_PAGE_NORMAL 0x00 /* for debugging */
124
125/* Flags used by PTE updates */
126#define VMI_PAGE_CURRENT_AS 0x10 /* implies VMI_PAGE_VA_MASK is valid */
127#define VMI_PAGE_DEFER 0x20 /* may queue update until TLB inval */
128#define VMI_PAGE_VA_MASK 0xfffff000
129
130#ifdef CONFIG_X86_PAE
131#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
132#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
133#else
134#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_ZEROED)
135#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_ZEROED)
136#endif
137
138/* Flags used by VMI_FlushTLB call */
139#define VMI_FLUSH_TLB 0x01
140#define VMI_FLUSH_GLOBAL 0x02
141
142/*
143 *---------------------------------------------------------------------
144 *
145 * VMI relocation definitions for ROM call get_reloc
146 *
147 *---------------------------------------------------------------------
148 */
149
150/* VMI Relocation types */
151#define VMI_RELOCATION_NONE 0
152#define VMI_RELOCATION_CALL_REL 1
153#define VMI_RELOCATION_JUMP_REL 2
154#define VMI_RELOCATION_NOP 3
155
156#ifndef __ASSEMBLY__
157struct vmi_relocation_info {
158 unsigned char *eip;
159 unsigned char type;
160 unsigned char reserved[3];
161};
162#endif
163
164
165/*
166 *---------------------------------------------------------------------
167 *
168 * Generic ROM structures and definitions
169 *
170 *---------------------------------------------------------------------
171 */
172
173#ifndef __ASSEMBLY__
174
175struct vrom_header {
176 u16 rom_signature; /* option ROM signature */
177 u8 rom_length; /* ROM length in 512 byte chunks */
178 u8 rom_entry[4]; /* 16-bit code entry point */
179 u8 rom_pad0; /* 4-byte align pad */
180 u32 vrom_signature; /* VROM identification signature */
181 u8 api_version_min;/* Minor version of API */
182 u8 api_version_maj;/* Major version of API */
183 u8 jump_slots; /* Number of jump slots */
184 u8 reserved1; /* Reserved for expansion */
185 u32 virtual_top; /* Hypervisor virtual address start */
186 u16 reserved2; /* Reserved for expansion */
187 u16 license_offs; /* Offset to License string */
188 u16 pci_header_offs;/* Offset to PCI OPROM header */
189 u16 pnp_header_offs;/* Offset to PnP OPROM header */
190 u32 rom_pad3; /* PnP reserverd / VMI reserved */
191 u8 reserved[96]; /* Reserved for headers */
192 char vmi_init[8]; /* VMI_Init jump point */
193 char get_reloc[8]; /* VMI_GetRelocationInfo jump point */
194} __attribute__((packed));
195
196struct pnp_header {
197 char sig[4];
198 char rev;
199 char size;
200 short next;
201 short res;
202 long devID;
203 unsigned short manufacturer_offset;
204 unsigned short product_offset;
205} __attribute__((packed));
206
207struct pci_header {
208 char sig[4];
209 short vendorID;
210 short deviceID;
211 short vpdData;
212 short size;
213 char rev;
214 char class;
215 char subclass;
216 char interface;
217 short chunks;
218 char rom_version_min;
219 char rom_version_maj;
220 char codetype;
221 char lastRom;
222 short reserved;
223} __attribute__((packed));
224
225/* Function prototypes for bootstrapping */
226#ifdef CONFIG_VMI
227extern void vmi_init(void);
228extern void vmi_activate(void);
229extern void vmi_bringup(void);
230#else
231static inline void vmi_init(void) {}
232static inline void vmi_activate(void) {}
233static inline void vmi_bringup(void) {}
234#endif
235
236/* State needed to start an application processor in an SMP system. */
237struct vmi_ap_state {
238 u32 cr0;
239 u32 cr2;
240 u32 cr3;
241 u32 cr4;
242
243 u64 efer;
244
245 u32 eip;
246 u32 eflags;
247 u32 eax;
248 u32 ebx;
249 u32 ecx;
250 u32 edx;
251 u32 esp;
252 u32 ebp;
253 u32 esi;
254 u32 edi;
255 u16 cs;
256 u16 ss;
257 u16 ds;
258 u16 es;
259 u16 fs;
260 u16 gs;
261 u16 ldtr;
262
263 u16 gdtr_limit;
264 u32 gdtr_base;
265 u32 idtr_base;
266 u16 idtr_limit;
267};
268
269#endif
diff --git a/arch/x86/include/asm/vmi_time.h b/arch/x86/include/asm/vmi_time.h
deleted file mode 100644
index c6e0bee93e3c..000000000000
--- a/arch/x86/include/asm/vmi_time.h
+++ /dev/null
@@ -1,98 +0,0 @@
1/*
2 * VMI Time wrappers
3 *
4 * Copyright (C) 2006, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Send feedback to dhecht@vmware.com
22 *
23 */
24
25#ifndef _ASM_X86_VMI_TIME_H
26#define _ASM_X86_VMI_TIME_H
27
28/*
29 * Raw VMI call indices for timer functions
30 */
31#define VMI_CALL_GetCycleFrequency 66
32#define VMI_CALL_GetCycleCounter 67
33#define VMI_CALL_SetAlarm 68
34#define VMI_CALL_CancelAlarm 69
35#define VMI_CALL_GetWallclockTime 70
36#define VMI_CALL_WallclockUpdated 71
37
38/* Cached VMI timer operations */
39extern struct vmi_timer_ops {
40 u64 (*get_cycle_frequency)(void);
41 u64 (*get_cycle_counter)(int);
42 u64 (*get_wallclock)(void);
43 int (*wallclock_updated)(void);
44 void (*set_alarm)(u32 flags, u64 expiry, u64 period);
45 void (*cancel_alarm)(u32 flags);
46} vmi_timer_ops;
47
48/* Prototypes */
49extern void __init vmi_time_init(void);
50extern unsigned long vmi_get_wallclock(void);
51extern int vmi_set_wallclock(unsigned long now);
52extern unsigned long long vmi_sched_clock(void);
53extern unsigned long vmi_tsc_khz(void);
54
55#ifdef CONFIG_X86_LOCAL_APIC
56extern void __devinit vmi_time_bsp_init(void);
57extern void __devinit vmi_time_ap_init(void);
58#endif
59
60/*
61 * When run under a hypervisor, a vcpu is always in one of three states:
62 * running, halted, or ready. The vcpu is in the 'running' state if it
63 * is executing. When the vcpu executes the halt interface, the vcpu
64 * enters the 'halted' state and remains halted until there is some work
65 * pending for the vcpu (e.g. an alarm expires, host I/O completes on
66 * behalf of virtual I/O). At this point, the vcpu enters the 'ready'
67 * state (waiting for the hypervisor to reschedule it). Finally, at any
68 * time when the vcpu is not in the 'running' state nor the 'halted'
69 * state, it is in the 'ready' state.
70 *
71 * Real time is advances while the vcpu is 'running', 'ready', or
72 * 'halted'. Stolen time is the time in which the vcpu is in the
73 * 'ready' state. Available time is the remaining time -- the vcpu is
74 * either 'running' or 'halted'.
75 *
76 * All three views of time are accessible through the VMI cycle
77 * counters.
78 */
79
80/* The cycle counters. */
81#define VMI_CYCLES_REAL 0
82#define VMI_CYCLES_AVAILABLE 1
83#define VMI_CYCLES_STOLEN 2
84
85/* The alarm interface 'flags' bits */
86#define VMI_ALARM_COUNTERS 2
87
88#define VMI_ALARM_COUNTER_MASK 0x000000ff
89
90#define VMI_ALARM_WIRED_IRQ0 0x00000000
91#define VMI_ALARM_WIRED_LVTT 0x00010000
92
93#define VMI_ALARM_IS_ONESHOT 0x00000000
94#define VMI_ALARM_IS_PERIODIC 0x00000100
95
96#define CONFIG_VMI_ALARM_HZ 100
97
98#endif /* _ASM_X86_VMI_TIME_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 7490bf8d1459..80a93dc99076 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -86,15 +86,15 @@ obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
86obj-$(CONFIG_KGDB) += kgdb.o 86obj-$(CONFIG_KGDB) += kgdb.o
87obj-$(CONFIG_VM86) += vm86_32.o 87obj-$(CONFIG_VM86) += vm86_32.o
88obj-$(CONFIG_EARLY_PRINTK) += early_printk.o 88obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
89obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o
89 90
90obj-$(CONFIG_HPET_TIMER) += hpet.o 91obj-$(CONFIG_HPET_TIMER) += hpet.o
91obj-$(CONFIG_APB_TIMER) += apb_timer.o 92obj-$(CONFIG_APB_TIMER) += apb_timer.o
92 93
93obj-$(CONFIG_K8_NB) += k8.o 94obj-$(CONFIG_AMD_NB) += amd_nb.o
94obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o 95obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
95obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o 96obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
96 97
97obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
98obj-$(CONFIG_KVM_GUEST) += kvm.o 98obj-$(CONFIG_KVM_GUEST) += kvm.o
99obj-$(CONFIG_KVM_CLOCK) += kvmclock.o 99obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
100obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o 100obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
@@ -107,6 +107,7 @@ obj-$(CONFIG_SCx200) += scx200.o
107scx200-y += scx200_32.o 107scx200-y += scx200_32.o
108 108
109obj-$(CONFIG_OLPC) += olpc.o 109obj-$(CONFIG_OLPC) += olpc.o
110obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o
110obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o 111obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o
111obj-$(CONFIG_X86_MRST) += mrst.o 112obj-$(CONFIG_X86_MRST) += mrst.o
112 113
@@ -123,7 +124,6 @@ obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
123# 64 bit specific files 124# 64 bit specific files
124ifeq ($(CONFIG_X86_64),y) 125ifeq ($(CONFIG_X86_64),y)
125 obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o 126 obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
126 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
127 obj-$(CONFIG_AUDIT) += audit_64.o 127 obj-$(CONFIG_AUDIT) += audit_64.o
128 128
129 obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o 129 obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index fb16f17e59be..5812404a0d4c 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -13,6 +13,7 @@
13 13
14#include <acpi/processor.h> 14#include <acpi/processor.h>
15#include <asm/acpi.h> 15#include <asm/acpi.h>
16#include <asm/mwait.h>
16 17
17/* 18/*
18 * Initialize bm_flags based on the CPU cache properties 19 * Initialize bm_flags based on the CPU cache properties
@@ -65,16 +66,6 @@ static struct cstate_entry __percpu *cpu_cstate_entry; /* per CPU ptr */
65 66
66static short mwait_supported[ACPI_PROCESSOR_MAX_POWER]; 67static short mwait_supported[ACPI_PROCESSOR_MAX_POWER];
67 68
68#define MWAIT_SUBSTATE_MASK (0xf)
69#define MWAIT_CSTATE_MASK (0xf)
70#define MWAIT_SUBSTATE_SIZE (4)
71
72#define CPUID_MWAIT_LEAF (5)
73#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
74#define CPUID5_ECX_INTERRUPT_BREAK (0x2)
75
76#define MWAIT_ECX_INTERRUPT_BREAK (0x1)
77
78#define NATIVE_CSTATE_BEYOND_HALT (2) 69#define NATIVE_CSTATE_BEYOND_HALT (2)
79 70
80static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) 71static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 679b6450382b..d2fdb0826df2 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 5a170cbbbed8..3cb482e123de 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -194,6 +194,39 @@ static inline unsigned long tbl_size(int entry_size)
194 return 1UL << shift; 194 return 1UL << shift;
195} 195}
196 196
197/* Access to l1 and l2 indexed register spaces */
198
199static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address)
200{
201 u32 val;
202
203 pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
204 pci_read_config_dword(iommu->dev, 0xfc, &val);
205 return val;
206}
207
208static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val)
209{
210 pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31));
211 pci_write_config_dword(iommu->dev, 0xfc, val);
212 pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
213}
214
215static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address)
216{
217 u32 val;
218
219 pci_write_config_dword(iommu->dev, 0xf0, address);
220 pci_read_config_dword(iommu->dev, 0xf4, &val);
221 return val;
222}
223
224static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val)
225{
226 pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8));
227 pci_write_config_dword(iommu->dev, 0xf4, val);
228}
229
197/**************************************************************************** 230/****************************************************************************
198 * 231 *
199 * AMD IOMMU MMIO register space handling functions 232 * AMD IOMMU MMIO register space handling functions
@@ -619,6 +652,7 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
619{ 652{
620 int cap_ptr = iommu->cap_ptr; 653 int cap_ptr = iommu->cap_ptr;
621 u32 range, misc; 654 u32 range, misc;
655 int i, j;
622 656
623 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET, 657 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
624 &iommu->cap); 658 &iommu->cap);
@@ -633,12 +667,29 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
633 MMIO_GET_LD(range)); 667 MMIO_GET_LD(range));
634 iommu->evt_msi_num = MMIO_MSI_NUM(misc); 668 iommu->evt_msi_num = MMIO_MSI_NUM(misc);
635 669
636 if (is_rd890_iommu(iommu->dev)) { 670 if (!is_rd890_iommu(iommu->dev))
637 pci_read_config_dword(iommu->dev, 0xf0, &iommu->cache_cfg[0]); 671 return;
638 pci_read_config_dword(iommu->dev, 0xf4, &iommu->cache_cfg[1]); 672
639 pci_read_config_dword(iommu->dev, 0xf8, &iommu->cache_cfg[2]); 673 /*
640 pci_read_config_dword(iommu->dev, 0xfc, &iommu->cache_cfg[3]); 674 * Some rd890 systems may not be fully reconfigured by the BIOS, so
641 } 675 * it's necessary for us to store this information so it can be
676 * reprogrammed on resume
677 */
678
679 pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4,
680 &iommu->stored_addr_lo);
681 pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8,
682 &iommu->stored_addr_hi);
683
684 /* Low bit locks writes to configuration space */
685 iommu->stored_addr_lo &= ~1;
686
687 for (i = 0; i < 6; i++)
688 for (j = 0; j < 0x12; j++)
689 iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j);
690
691 for (i = 0; i < 0x83; i++)
692 iommu->stored_l2[i] = iommu_read_l2(iommu, i);
642} 693}
643 694
644/* 695/*
@@ -1127,14 +1178,53 @@ static void iommu_init_flags(struct amd_iommu *iommu)
1127 iommu_feature_enable(iommu, CONTROL_COHERENT_EN); 1178 iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
1128} 1179}
1129 1180
1130static void iommu_apply_quirks(struct amd_iommu *iommu) 1181static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
1131{ 1182{
1132 if (is_rd890_iommu(iommu->dev)) { 1183 int i, j;
1133 pci_write_config_dword(iommu->dev, 0xf0, iommu->cache_cfg[0]); 1184 u32 ioc_feature_control;
1134 pci_write_config_dword(iommu->dev, 0xf4, iommu->cache_cfg[1]); 1185 struct pci_dev *pdev = NULL;
1135 pci_write_config_dword(iommu->dev, 0xf8, iommu->cache_cfg[2]); 1186
1136 pci_write_config_dword(iommu->dev, 0xfc, iommu->cache_cfg[3]); 1187 /* RD890 BIOSes may not have completely reconfigured the iommu */
1137 } 1188 if (!is_rd890_iommu(iommu->dev))
1189 return;
1190
1191 /*
1192 * First, we need to ensure that the iommu is enabled. This is
1193 * controlled by a register in the northbridge
1194 */
1195 pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0));
1196
1197 if (!pdev)
1198 return;
1199
1200 /* Select Northbridge indirect register 0x75 and enable writing */
1201 pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7));
1202 pci_read_config_dword(pdev, 0x64, &ioc_feature_control);
1203
1204 /* Enable the iommu */
1205 if (!(ioc_feature_control & 0x1))
1206 pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1);
1207
1208 pci_dev_put(pdev);
1209
1210 /* Restore the iommu BAR */
1211 pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
1212 iommu->stored_addr_lo);
1213 pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8,
1214 iommu->stored_addr_hi);
1215
1216 /* Restore the l1 indirect regs for each of the 6 l1s */
1217 for (i = 0; i < 6; i++)
1218 for (j = 0; j < 0x12; j++)
1219 iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]);
1220
1221 /* Restore the l2 indirect regs */
1222 for (i = 0; i < 0x83; i++)
1223 iommu_write_l2(iommu, i, iommu->stored_l2[i]);
1224
1225 /* Lock PCI setup registers */
1226 pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
1227 iommu->stored_addr_lo | 1);
1138} 1228}
1139 1229
1140/* 1230/*
@@ -1147,7 +1237,6 @@ static void enable_iommus(void)
1147 1237
1148 for_each_iommu(iommu) { 1238 for_each_iommu(iommu) {
1149 iommu_disable(iommu); 1239 iommu_disable(iommu);
1150 iommu_apply_quirks(iommu);
1151 iommu_init_flags(iommu); 1240 iommu_init_flags(iommu);
1152 iommu_set_device_table(iommu); 1241 iommu_set_device_table(iommu);
1153 iommu_enable_command_buffer(iommu); 1242 iommu_enable_command_buffer(iommu);
@@ -1173,6 +1262,11 @@ static void disable_iommus(void)
1173 1262
1174static int amd_iommu_resume(struct sys_device *dev) 1263static int amd_iommu_resume(struct sys_device *dev)
1175{ 1264{
1265 struct amd_iommu *iommu;
1266
1267 for_each_iommu(iommu)
1268 iommu_apply_resume_quirks(iommu);
1269
1176 /* re-load the hardware */ 1270 /* re-load the hardware */
1177 enable_iommus(); 1271 enable_iommus();
1178 1272
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/amd_nb.c
index 0f7bc20cfcde..8f6463d8ed0d 100644
--- a/arch/x86/kernel/k8.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -8,21 +8,19 @@
8#include <linux/errno.h> 8#include <linux/errno.h>
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/spinlock.h> 10#include <linux/spinlock.h>
11#include <asm/k8.h> 11#include <asm/amd_nb.h>
12
13int num_k8_northbridges;
14EXPORT_SYMBOL(num_k8_northbridges);
15 12
16static u32 *flush_words; 13static u32 *flush_words;
17 14
18struct pci_device_id k8_nb_ids[] = { 15struct pci_device_id k8_nb_ids[] = {
19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, 16 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, 17 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
18 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_MISC) },
21 {} 19 {}
22}; 20};
23EXPORT_SYMBOL(k8_nb_ids); 21EXPORT_SYMBOL(k8_nb_ids);
24 22
25struct pci_dev **k8_northbridges; 23struct k8_northbridge_info k8_northbridges;
26EXPORT_SYMBOL(k8_northbridges); 24EXPORT_SYMBOL(k8_northbridges);
27 25
28static struct pci_dev *next_k8_northbridge(struct pci_dev *dev) 26static struct pci_dev *next_k8_northbridge(struct pci_dev *dev)
@@ -40,36 +38,45 @@ int cache_k8_northbridges(void)
40 int i; 38 int i;
41 struct pci_dev *dev; 39 struct pci_dev *dev;
42 40
43 if (num_k8_northbridges) 41 if (k8_northbridges.num)
44 return 0; 42 return 0;
45 43
46 dev = NULL; 44 dev = NULL;
47 while ((dev = next_k8_northbridge(dev)) != NULL) 45 while ((dev = next_k8_northbridge(dev)) != NULL)
48 num_k8_northbridges++; 46 k8_northbridges.num++;
47
48 /* some CPU families (e.g. family 0x11) do not support GART */
49 if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
50 boot_cpu_data.x86 == 0x15)
51 k8_northbridges.gart_supported = 1;
49 52
50 k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *), 53 k8_northbridges.nb_misc = kmalloc((k8_northbridges.num + 1) *
51 GFP_KERNEL); 54 sizeof(void *), GFP_KERNEL);
52 if (!k8_northbridges) 55 if (!k8_northbridges.nb_misc)
53 return -ENOMEM; 56 return -ENOMEM;
54 57
55 if (!num_k8_northbridges) { 58 if (!k8_northbridges.num) {
56 k8_northbridges[0] = NULL; 59 k8_northbridges.nb_misc[0] = NULL;
57 return 0; 60 return 0;
58 } 61 }
59 62
60 flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL); 63 if (k8_northbridges.gart_supported) {
61 if (!flush_words) { 64 flush_words = kmalloc(k8_northbridges.num * sizeof(u32),
62 kfree(k8_northbridges); 65 GFP_KERNEL);
63 return -ENOMEM; 66 if (!flush_words) {
67 kfree(k8_northbridges.nb_misc);
68 return -ENOMEM;
69 }
64 } 70 }
65 71
66 dev = NULL; 72 dev = NULL;
67 i = 0; 73 i = 0;
68 while ((dev = next_k8_northbridge(dev)) != NULL) { 74 while ((dev = next_k8_northbridge(dev)) != NULL) {
69 k8_northbridges[i] = dev; 75 k8_northbridges.nb_misc[i] = dev;
70 pci_read_config_dword(dev, 0x9c, &flush_words[i++]); 76 if (k8_northbridges.gart_supported)
77 pci_read_config_dword(dev, 0x9c, &flush_words[i++]);
71 } 78 }
72 k8_northbridges[i] = NULL; 79 k8_northbridges.nb_misc[i] = NULL;
73 return 0; 80 return 0;
74} 81}
75EXPORT_SYMBOL_GPL(cache_k8_northbridges); 82EXPORT_SYMBOL_GPL(cache_k8_northbridges);
@@ -93,22 +100,25 @@ void k8_flush_garts(void)
93 unsigned long flags; 100 unsigned long flags;
94 static DEFINE_SPINLOCK(gart_lock); 101 static DEFINE_SPINLOCK(gart_lock);
95 102
103 if (!k8_northbridges.gart_supported)
104 return;
105
96 /* Avoid races between AGP and IOMMU. In theory it's not needed 106 /* Avoid races between AGP and IOMMU. In theory it's not needed
97 but I'm not sure if the hardware won't lose flush requests 107 but I'm not sure if the hardware won't lose flush requests
98 when another is pending. This whole thing is so expensive anyways 108 when another is pending. This whole thing is so expensive anyways
99 that it doesn't matter to serialize more. -AK */ 109 that it doesn't matter to serialize more. -AK */
100 spin_lock_irqsave(&gart_lock, flags); 110 spin_lock_irqsave(&gart_lock, flags);
101 flushed = 0; 111 flushed = 0;
102 for (i = 0; i < num_k8_northbridges; i++) { 112 for (i = 0; i < k8_northbridges.num; i++) {
103 pci_write_config_dword(k8_northbridges[i], 0x9c, 113 pci_write_config_dword(k8_northbridges.nb_misc[i], 0x9c,
104 flush_words[i]|1); 114 flush_words[i]|1);
105 flushed++; 115 flushed++;
106 } 116 }
107 for (i = 0; i < num_k8_northbridges; i++) { 117 for (i = 0; i < k8_northbridges.num; i++) {
108 u32 w; 118 u32 w;
109 /* Make sure the hardware actually executed the flush*/ 119 /* Make sure the hardware actually executed the flush*/
110 for (;;) { 120 for (;;) {
111 pci_read_config_dword(k8_northbridges[i], 121 pci_read_config_dword(k8_northbridges.nb_misc[i],
112 0x9c, &w); 122 0x9c, &w);
113 if (!(w & 1)) 123 if (!(w & 1))
114 break; 124 break;
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 8dd77800ff5d..92543c73cf8e 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -231,34 +231,6 @@ static void apbt_restart_clocksource(struct clocksource *cs)
231 apbt_start_counter(phy_cs_timer_id); 231 apbt_start_counter(phy_cs_timer_id);
232} 232}
233 233
234/* Setup IRQ routing via IOAPIC */
235#ifdef CONFIG_SMP
236static void apbt_setup_irq(struct apbt_dev *adev)
237{
238 struct irq_chip *chip;
239 struct irq_desc *desc;
240
241 /* timer0 irq has been setup early */
242 if (adev->irq == 0)
243 return;
244 desc = irq_to_desc(adev->irq);
245 chip = get_irq_chip(adev->irq);
246 disable_irq(adev->irq);
247 desc->status |= IRQ_MOVE_PCNTXT;
248 irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
249 /* APB timer irqs are set up as mp_irqs, timer is edge triggerred */
250 set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge");
251 enable_irq(adev->irq);
252 if (system_state == SYSTEM_BOOTING)
253 if (request_irq(adev->irq, apbt_interrupt_handler,
254 IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
255 adev->name, adev)) {
256 printk(KERN_ERR "Failed request IRQ for APBT%d\n",
257 adev->num);
258 }
259}
260#endif
261
262static void apbt_enable_int(int n) 234static void apbt_enable_int(int n)
263{ 235{
264 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); 236 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
@@ -334,6 +306,27 @@ static int __init apbt_clockevent_register(void)
334} 306}
335 307
336#ifdef CONFIG_SMP 308#ifdef CONFIG_SMP
309
310static void apbt_setup_irq(struct apbt_dev *adev)
311{
312 /* timer0 irq has been setup early */
313 if (adev->irq == 0)
314 return;
315
316 if (system_state == SYSTEM_BOOTING) {
317 irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
318 /* APB timer irqs are set up as mp_irqs, timer is edge type */
319 __set_irq_handler(adev->irq, handle_edge_irq, 0, "edge");
320 if (request_irq(adev->irq, apbt_interrupt_handler,
321 IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
322 adev->name, adev)) {
323 printk(KERN_ERR "Failed request IRQ for APBT%d\n",
324 adev->num);
325 }
326 } else
327 enable_irq(adev->irq);
328}
329
337/* Should be called with per cpu */ 330/* Should be called with per cpu */
338void apbt_setup_secondary_clock(void) 331void apbt_setup_secondary_clock(void)
339{ 332{
@@ -343,7 +336,7 @@ void apbt_setup_secondary_clock(void)
343 336
344 /* Don't register boot CPU clockevent */ 337 /* Don't register boot CPU clockevent */
345 cpu = smp_processor_id(); 338 cpu = smp_processor_id();
346 if (cpu == boot_cpu_id) 339 if (!cpu)
347 return; 340 return;
348 /* 341 /*
349 * We need to calculate the scaled math multiplication factor for 342 * We need to calculate the scaled math multiplication factor for
@@ -389,16 +382,17 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
389 382
390 switch (action & 0xf) { 383 switch (action & 0xf) {
391 case CPU_DEAD: 384 case CPU_DEAD:
385 disable_irq(adev->irq);
392 apbt_disable_int(cpu); 386 apbt_disable_int(cpu);
393 if (system_state == SYSTEM_RUNNING) 387 if (system_state == SYSTEM_RUNNING) {
394 pr_debug("skipping APBT CPU %lu offline\n", cpu); 388 pr_debug("skipping APBT CPU %lu offline\n", cpu);
395 else if (adev) { 389 } else if (adev) {
396 pr_debug("APBT clockevent for cpu %lu offline\n", cpu); 390 pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
397 free_irq(adev->irq, adev); 391 free_irq(adev->irq, adev);
398 } 392 }
399 break; 393 break;
400 default: 394 default:
401 pr_debug(KERN_INFO "APBT notified %lu, no action\n", action); 395 pr_debug("APBT notified %lu, no action\n", action);
402 } 396 }
403 return NOTIFY_OK; 397 return NOTIFY_OK;
404} 398}
@@ -552,7 +546,7 @@ bad_count:
552 pr_debug("APB CS going back %lx:%lx:%lx ", 546 pr_debug("APB CS going back %lx:%lx:%lx ",
553 t2, last_read, t2 - last_read); 547 t2, last_read, t2 - last_read);
554bad_count_x3: 548bad_count_x3:
555 pr_debug(KERN_INFO "tripple check enforced\n"); 549 pr_debug("triple check enforced\n");
556 t0 = apbt_readl(phy_cs_timer_id, 550 t0 = apbt_readl(phy_cs_timer_id,
557 APBTMR_N_CURRENT_VALUE); 551 APBTMR_N_CURRENT_VALUE);
558 udelay(1); 552 udelay(1);
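
Editor's note: a minimal sketch of the control flow the reworked apbt_setup_irq() follows, i.e. the handler is requested only once while the system is still booting, and a later CPU online merely re-enables the already-registered line. This is a stand-alone user-space model, not the kernel code; fake_request_irq(), fake_enable_irq() and the sys_state enum are illustrative stand-ins.

/* Hedged sketch of the "request once at boot, re-enable on hotplug" pattern. */
#include <stdbool.h>
#include <stdio.h>

enum sys_state { SYSTEM_BOOTING, SYSTEM_RUNNING };

static bool irq_registered[16];              /* one slot per fake IRQ line */

static int fake_request_irq(int irq)         /* register a handler once */
{
    if (irq_registered[irq])
        return -1;                           /* already owned */
    irq_registered[irq] = true;
    printf("irq %d: handler registered\n", irq);
    return 0;
}

static void fake_enable_irq(int irq)         /* just unmask an existing line */
{
    printf("irq %d: re-enabled, handler kept\n", irq);
}

static void timer_setup(int irq, enum sys_state state)
{
    if (irq == 0)
        return;                              /* timer0 was set up earlier */
    if (state == SYSTEM_BOOTING)
        fake_request_irq(irq);               /* first bring-up: register */
    else
        fake_enable_irq(irq);                /* CPU came back online: unmask */
}

int main(void)
{
    timer_setup(4, SYSTEM_BOOTING);          /* boot-time path */
    timer_setup(4, SYSTEM_RUNNING);          /* later hotplug-online path */
    return 0;
}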
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index a2e0caf26e17..377f5db3b8b4 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -27,7 +27,7 @@
27#include <asm/gart.h> 27#include <asm/gart.h>
28#include <asm/pci-direct.h> 28#include <asm/pci-direct.h>
29#include <asm/dma.h> 29#include <asm/dma.h>
30#include <asm/k8.h> 30#include <asm/amd_nb.h>
31#include <asm/x86_init.h> 31#include <asm/x86_init.h>
32 32
33int gart_iommu_aperture; 33int gart_iommu_aperture;
@@ -307,7 +307,7 @@ void __init early_gart_iommu_check(void)
307 continue; 307 continue;
308 308
309 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); 309 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
310 aper_enabled = ctl & AMD64_GARTEN; 310 aper_enabled = ctl & GARTEN;
311 aper_order = (ctl >> 1) & 7; 311 aper_order = (ctl >> 1) & 7;
312 aper_size = (32 * 1024 * 1024) << aper_order; 312 aper_size = (32 * 1024 * 1024) << aper_order;
313 aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; 313 aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
@@ -362,7 +362,7 @@ void __init early_gart_iommu_check(void)
362 continue; 362 continue;
363 363
364 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); 364 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
365 ctl &= ~AMD64_GARTEN; 365 ctl &= ~GARTEN;
366 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); 366 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
367 } 367 }
368 } 368 }
@@ -505,8 +505,13 @@ out:
505 505
506 /* Fix up the north bridges */ 506 /* Fix up the north bridges */
507 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 507 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
508 int bus; 508 int bus, dev_base, dev_limit;
509 int dev_base, dev_limit; 509
510 /*
511 * Don't enable translation yet but enable GART IO and CPU
512 * accesses and set DISTLBWALKPRB since GART table memory is UC.
513 */
514 u32 ctl = DISTLBWALKPRB | aper_order << 1;
510 515
511 bus = bus_dev_ranges[i].bus; 516 bus = bus_dev_ranges[i].bus;
512 dev_base = bus_dev_ranges[i].dev_base; 517 dev_base = bus_dev_ranges[i].dev_base;
@@ -515,10 +520,7 @@ out:
515 if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) 520 if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00)))
516 continue; 521 continue;
517 522
518 /* Don't enable translation yet. That is done later. 523 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
519 Assume this BIOS didn't initialise the GART so
520 just overwrite all previous bits */
521 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1);
522 write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25); 524 write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25);
523 } 525 }
524 } 526 }
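
Editor's note: the aperture fix above composes the control word up front and leaves translation disabled, enabling GARTEN only later. The sketch below shows that two-step composition as a stand-alone program; the bit positions (GARTEN in bit 0, aperture order in bits 3:1, DISTLBWALKPRB in bit 6) are restated from asm/gart.h purely for illustration and should be checked against the header before reuse.

/* Hedged sketch of composing AMD64_GARTAPERTURECTL without GARTEN. */
#include <stdio.h>

#define GARTEN        (1u << 0)   /* GART translation enable */
#define DISTLBWALKPRB (1u << 6)   /* disable TLB-walk probe; GART table is UC */

int main(void)
{
    unsigned int aper_order = 2;  /* example: 32 MB << 2 = 128 MB aperture */

    /* Step 1 (this patch): program size + DISTLBWALKPRB, translation off */
    unsigned int ctl = DISTLBWALKPRB | (aper_order << 1);
    printf("initial GARTAPERTURECTL = 0x%02x (translation off)\n", ctl);

    /* Step 2 (done later by the GART IOMMU init code): turn translation on */
    ctl |= GARTEN;
    printf("after enable            = 0x%02x (translation on)\n", ctl);

    /* Decode the order back, as early_gart_iommu_check() does */
    unsigned int order = (ctl >> 1) & 7;
    printf("aperture size = %u MB\n", 32u << order);
    return 0;
}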
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index e3b534cda49a..850657d1b0ed 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -52,6 +52,7 @@
52#include <asm/mce.h> 52#include <asm/mce.h>
53#include <asm/kvm_para.h> 53#include <asm/kvm_para.h>
54#include <asm/tsc.h> 54#include <asm/tsc.h>
55#include <asm/atomic.h>
55 56
56unsigned int num_processors; 57unsigned int num_processors;
57 58
@@ -370,38 +371,87 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
370} 371}
371 372
372/* 373/*
373 * Setup extended LVT, AMD specific (K8, family 10h) 374 * Setup extended LVT, AMD specific
374 * 375 *
375 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and 376 * Software should use the LVT offsets the BIOS provides. The offsets
376 * MCE interrupts are supported. Thus MCE offset must be set to 0. 377 * are determined by the subsystems using it like those for MCE
378 * threshold or IBS. On K8 only offset 0 (APIC500) and MCE interrupts
379 * are supported. Beginning with family 10h at least 4 offsets are
380 * available.
377 * 381 *
378 * If mask=1, the LVT entry does not generate interrupts while mask=0 382 * Since the offsets must be consistent for all cores, we keep track
379 * enables the vector. See also the BKDGs. 383 * of the LVT offsets in software and reserve the offset for the same
384 * vector also to be used on other cores. An offset is freed by
385 * setting the entry to APIC_EILVT_MASKED.
386 *
387 * If the BIOS is right, there should be no conflicts. Otherwise a
388 * "[Firmware Bug]: ..." error message is generated. However, if
389 * software does not properly determines the offsets, it is not
390 * necessarily a BIOS bug.
380 */ 391 */
381 392
382#define APIC_EILVT_LVTOFF_MCE 0 393static atomic_t eilvt_offsets[APIC_EILVT_NR_MAX];
383#define APIC_EILVT_LVTOFF_IBS 1
384 394
385static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) 395static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new)
386{ 396{
387 unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0); 397 return (old & APIC_EILVT_MASKED)
388 unsigned int v = (mask << 16) | (msg_type << 8) | vector; 398 || (new == APIC_EILVT_MASKED)
389 399 || ((new & ~APIC_EILVT_MASKED) == old);
390 apic_write(reg, v);
391} 400}
392 401
393u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) 402static unsigned int reserve_eilvt_offset(int offset, unsigned int new)
394{ 403{
395 setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); 404 unsigned int rsvd; /* 0: uninitialized */
396 return APIC_EILVT_LVTOFF_MCE; 405
406 if (offset >= APIC_EILVT_NR_MAX)
407 return ~0;
408
409 rsvd = atomic_read(&eilvt_offsets[offset]) & ~APIC_EILVT_MASKED;
410 do {
411 if (rsvd &&
412 !eilvt_entry_is_changeable(rsvd, new))
413 /* may not change if vectors are different */
414 return rsvd;
415 rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new);
416 } while (rsvd != new);
417
418 return new;
397} 419}
398 420
399u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) 421/*
422 * If mask=1, the LVT entry does not generate interrupts while mask=0
423 * enables the vector. See also the BKDGs.
424 */
425
426int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
400{ 427{
401 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); 428 unsigned long reg = APIC_EILVTn(offset);
402 return APIC_EILVT_LVTOFF_IBS; 429 unsigned int new, old, reserved;
430
431 new = (mask << 16) | (msg_type << 8) | vector;
432 old = apic_read(reg);
433 reserved = reserve_eilvt_offset(offset, new);
434
435 if (reserved != new) {
436 pr_err(FW_BUG "cpu %d, try to setup vector 0x%x, but "
437 "vector 0x%x was already reserved by another core, "
438 "APIC%lX=0x%x\n",
439 smp_processor_id(), new, reserved, reg, old);
440 return -EINVAL;
441 }
442
443 if (!eilvt_entry_is_changeable(old, new)) {
444 pr_err(FW_BUG "cpu %d, try to setup vector 0x%x but "
445 "register already in use, APIC%lX=0x%x\n",
446 smp_processor_id(), new, reg, old);
447 return -EBUSY;
448 }
449
450 apic_write(reg, new);
451
452 return 0;
403} 453}
404EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); 454EXPORT_SYMBOL_GPL(setup_APIC_eilvt);
405 455
406/* 456/*
407 * Program the next event, relative to now 457 * Program the next event, relative to now
@@ -1665,10 +1715,7 @@ int __init APIC_init_uniprocessor(void)
1665 } 1715 }
1666#endif 1716#endif
1667 1717
1668#ifndef CONFIG_SMP
1669 enable_IR_x2apic();
1670 default_setup_apic_routing(); 1718 default_setup_apic_routing();
1671#endif
1672 1719
1673 verify_local_APIC(); 1720 verify_local_APIC();
1674 connect_bsp_APIC(); 1721 connect_bsp_APIC();
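
Editor's note: the new reserve_eilvt_offset() claims an extended-LVT offset with a lock-free compare-and-swap so all cores agree on which vector owns which offset. The sketch below models that reservation loop in user space with C11 atomics; EILVT_MASKED, the table size and the vector numbers are illustrative stand-ins, not the APIC register layout.

/* Hedged sketch of the cmpxchg-based offset reservation. */
#include <stdatomic.h>
#include <stdio.h>

#define NR_OFFSETS   4
#define EILVT_MASKED (1u << 16)               /* "entry disabled" marker bit */

static _Atomic unsigned int offsets[NR_OFFSETS];   /* 0 = uninitialized */

static int entry_is_changeable(unsigned int old, unsigned int new)
{
    /* free slot, explicit free request, or identical payload */
    return (old & EILVT_MASKED) || (new == EILVT_MASKED) ||
           ((new & ~EILVT_MASKED) == old);
}

static unsigned int reserve_offset(int offset, unsigned int new)
{
    unsigned int cur = atomic_load(&offsets[offset]);

    for (;;) {
        unsigned int rsvd = cur & ~EILVT_MASKED;

        if (rsvd && !entry_is_changeable(rsvd, new))
            return rsvd;                      /* a different vector owns this offset */
        if (atomic_compare_exchange_strong(&offsets[offset], &cur, new))
            return new;                       /* reservation succeeded */
        /* lost a race: cur now holds the winner's value, re-evaluate */
    }
}

int main(void)
{
    unsigned int first  = reserve_offset(1, 0xf9);   /* e.g. an IBS vector */
    unsigned int second = reserve_offset(1, 0xfa);   /* conflicting request */

    printf("first:  wanted 0xf9, got 0x%x\n", first);
    printf("second: wanted 0xfa, got 0x%x -> %s\n", second,
           second == 0xfa ? "ok" : "conflict, as setup_APIC_eilvt() reports");
    return 0;
}

A caller that does not get back the value it asked for treats the offset as taken by another core, which is exactly the FW_BUG case handled in setup_APIC_eilvt() above.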
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 5c5b8f3dddb5..8ae808d110f4 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -131,13 +131,9 @@ struct irq_pin_list {
131 struct irq_pin_list *next; 131 struct irq_pin_list *next;
132}; 132};
133 133
134static struct irq_pin_list *get_one_free_irq_2_pin(int node) 134static struct irq_pin_list *alloc_irq_pin_list(int node)
135{ 135{
136 struct irq_pin_list *pin; 136 return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node);
137
138 pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
139
140 return pin;
141} 137}
142 138
143/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ 139/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
@@ -150,10 +146,7 @@ static struct irq_cfg irq_cfgx[NR_IRQS];
150int __init arch_early_irq_init(void) 146int __init arch_early_irq_init(void)
151{ 147{
152 struct irq_cfg *cfg; 148 struct irq_cfg *cfg;
153 struct irq_desc *desc; 149 int count, node, i;
154 int count;
155 int node;
156 int i;
157 150
158 if (!legacy_pic->nr_legacy_irqs) { 151 if (!legacy_pic->nr_legacy_irqs) {
159 nr_irqs_gsi = 0; 152 nr_irqs_gsi = 0;
@@ -162,13 +155,15 @@ int __init arch_early_irq_init(void)
162 155
163 cfg = irq_cfgx; 156 cfg = irq_cfgx;
164 count = ARRAY_SIZE(irq_cfgx); 157 count = ARRAY_SIZE(irq_cfgx);
165 node= cpu_to_node(boot_cpu_id); 158 node = cpu_to_node(0);
159
160 /* Make sure the legacy interrupts are marked in the bitmap */
161 irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs);
166 162
167 for (i = 0; i < count; i++) { 163 for (i = 0; i < count; i++) {
168 desc = irq_to_desc(i); 164 set_irq_chip_data(i, &cfg[i]);
169 desc->chip_data = &cfg[i]; 165 zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node);
170 zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); 166 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node);
171 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
172 /* 167 /*
173 * For legacy IRQ's, start with assigning irq0 to irq15 to 168 * For legacy IRQ's, start with assigning irq0 to irq15 to
174 * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0. 169 * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0.
@@ -183,170 +178,88 @@ int __init arch_early_irq_init(void)
183} 178}
184 179
185#ifdef CONFIG_SPARSE_IRQ 180#ifdef CONFIG_SPARSE_IRQ
186struct irq_cfg *irq_cfg(unsigned int irq) 181static struct irq_cfg *irq_cfg(unsigned int irq)
187{ 182{
188 struct irq_cfg *cfg = NULL; 183 return get_irq_chip_data(irq);
189 struct irq_desc *desc;
190
191 desc = irq_to_desc(irq);
192 if (desc)
193 cfg = desc->chip_data;
194
195 return cfg;
196} 184}
197 185
198static struct irq_cfg *get_one_free_irq_cfg(int node) 186static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
199{ 187{
200 struct irq_cfg *cfg; 188 struct irq_cfg *cfg;
201 189
202 cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); 190 cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node);
203 if (cfg) { 191 if (!cfg)
204 if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { 192 return NULL;
205 kfree(cfg); 193 if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node))
206 cfg = NULL; 194 goto out_cfg;
207 } else if (!zalloc_cpumask_var_node(&cfg->old_domain, 195 if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node))
208 GFP_ATOMIC, node)) { 196 goto out_domain;
209 free_cpumask_var(cfg->domain);
210 kfree(cfg);
211 cfg = NULL;
212 }
213 }
214
215 return cfg; 197 return cfg;
198out_domain:
199 free_cpumask_var(cfg->domain);
200out_cfg:
201 kfree(cfg);
202 return NULL;
216} 203}
217 204
218int arch_init_chip_data(struct irq_desc *desc, int node) 205static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg)
219{
220 struct irq_cfg *cfg;
221
222 cfg = desc->chip_data;
223 if (!cfg) {
224 desc->chip_data = get_one_free_irq_cfg(node);
225 if (!desc->chip_data) {
226 printk(KERN_ERR "can not alloc irq_cfg\n");
227 BUG_ON(1);
228 }
229 }
230
231 return 0;
232}
233
234/* for move_irq_desc */
235static void
236init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node)
237{ 206{
238 struct irq_pin_list *old_entry, *head, *tail, *entry; 207 if (!cfg)
239
240 cfg->irq_2_pin = NULL;
241 old_entry = old_cfg->irq_2_pin;
242 if (!old_entry)
243 return;
244
245 entry = get_one_free_irq_2_pin(node);
246 if (!entry)
247 return; 208 return;
209 set_irq_chip_data(at, NULL);
210 free_cpumask_var(cfg->domain);
211 free_cpumask_var(cfg->old_domain);
212 kfree(cfg);
213}
248 214
249 entry->apic = old_entry->apic; 215#else
250 entry->pin = old_entry->pin;
251 head = entry;
252 tail = entry;
253 old_entry = old_entry->next;
254 while (old_entry) {
255 entry = get_one_free_irq_2_pin(node);
256 if (!entry) {
257 entry = head;
258 while (entry) {
259 head = entry->next;
260 kfree(entry);
261 entry = head;
262 }
263 /* still use the old one */
264 return;
265 }
266 entry->apic = old_entry->apic;
267 entry->pin = old_entry->pin;
268 tail->next = entry;
269 tail = entry;
270 old_entry = old_entry->next;
271 }
272 216
273 tail->next = NULL; 217struct irq_cfg *irq_cfg(unsigned int irq)
274 cfg->irq_2_pin = head; 218{
219 return irq < nr_irqs ? irq_cfgx + irq : NULL;
275} 220}
276 221
277static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg) 222static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
278{ 223{
279 struct irq_pin_list *entry, *next; 224 return irq_cfgx + irq;
280 225}
281 if (old_cfg->irq_2_pin == cfg->irq_2_pin)
282 return;
283 226
284 entry = old_cfg->irq_2_pin; 227static inline void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) { }
285 228
286 while (entry) { 229#endif
287 next = entry->next;
288 kfree(entry);
289 entry = next;
290 }
291 old_cfg->irq_2_pin = NULL;
292}
293 230
294void arch_init_copy_chip_data(struct irq_desc *old_desc, 231static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
295 struct irq_desc *desc, int node)
296{ 232{
233 int res = irq_alloc_desc_at(at, node);
297 struct irq_cfg *cfg; 234 struct irq_cfg *cfg;
298 struct irq_cfg *old_cfg;
299
300 cfg = get_one_free_irq_cfg(node);
301 235
302 if (!cfg) 236 if (res < 0) {
303 return; 237 if (res != -EEXIST)
304 238 return NULL;
305 desc->chip_data = cfg; 239 cfg = get_irq_chip_data(at);
306 240 if (cfg)
307 old_cfg = old_desc->chip_data; 241 return cfg;
308 242 }
309 cfg->vector = old_cfg->vector;
310 cfg->move_in_progress = old_cfg->move_in_progress;
311 cpumask_copy(cfg->domain, old_cfg->domain);
312 cpumask_copy(cfg->old_domain, old_cfg->old_domain);
313
314 init_copy_irq_2_pin(old_cfg, cfg, node);
315}
316 243
317static void free_irq_cfg(struct irq_cfg *cfg) 244 cfg = alloc_irq_cfg(at, node);
318{ 245 if (cfg)
319 free_cpumask_var(cfg->domain); 246 set_irq_chip_data(at, cfg);
320 free_cpumask_var(cfg->old_domain); 247 else
321 kfree(cfg); 248 irq_free_desc(at);
249 return cfg;
322} 250}
323 251
324void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) 252static int alloc_irq_from(unsigned int from, int node)
325{ 253{
326 struct irq_cfg *old_cfg, *cfg; 254 return irq_alloc_desc_from(from, node);
327
328 old_cfg = old_desc->chip_data;
329 cfg = desc->chip_data;
330
331 if (old_cfg == cfg)
332 return;
333
334 if (old_cfg) {
335 free_irq_2_pin(old_cfg, cfg);
336 free_irq_cfg(old_cfg);
337 old_desc->chip_data = NULL;
338 }
339} 255}
340/* end for move_irq_desc */
341 256
342#else 257static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
343struct irq_cfg *irq_cfg(unsigned int irq)
344{ 258{
345 return irq < nr_irqs ? irq_cfgx + irq : NULL; 259 free_irq_cfg(at, cfg);
260 irq_free_desc(at);
346} 261}
347 262
348#endif
349
350struct io_apic { 263struct io_apic {
351 unsigned int index; 264 unsigned int index;
352 unsigned int unused[3]; 265 unsigned int unused[3];
@@ -451,7 +364,7 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
451 io_apic_write(apic, 0x10 + 2*pin, eu.w1); 364 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
452} 365}
453 366
454void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) 367static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
455{ 368{
456 unsigned long flags; 369 unsigned long flags;
457 raw_spin_lock_irqsave(&ioapic_lock, flags); 370 raw_spin_lock_irqsave(&ioapic_lock, flags);
@@ -481,7 +394,7 @@ static void ioapic_mask_entry(int apic, int pin)
481 * fast in the common case, and fast for shared ISA-space IRQs. 394 * fast in the common case, and fast for shared ISA-space IRQs.
482 */ 395 */
483static int 396static int
484add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin) 397__add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
485{ 398{
486 struct irq_pin_list **last, *entry; 399 struct irq_pin_list **last, *entry;
487 400
@@ -493,7 +406,7 @@ add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin)
493 last = &entry->next; 406 last = &entry->next;
494 } 407 }
495 408
496 entry = get_one_free_irq_2_pin(node); 409 entry = alloc_irq_pin_list(node);
497 if (!entry) { 410 if (!entry) {
498 printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n", 411 printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n",
499 node, apic, pin); 412 node, apic, pin);
@@ -508,7 +421,7 @@ add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin)
508 421
509static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) 422static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
510{ 423{
511 if (add_pin_to_irq_node_nopanic(cfg, node, apic, pin)) 424 if (__add_pin_to_irq_node(cfg, node, apic, pin))
512 panic("IO-APIC: failed to add irq-pin. Can not proceed\n"); 425 panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
513} 426}
514 427
@@ -571,11 +484,6 @@ static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry)
571 IO_APIC_REDIR_LEVEL_TRIGGER, NULL); 484 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
572} 485}
573 486
574static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
575{
576 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
577}
578
579static void io_apic_sync(struct irq_pin_list *entry) 487static void io_apic_sync(struct irq_pin_list *entry)
580{ 488{
581 /* 489 /*
@@ -587,44 +495,37 @@ static void io_apic_sync(struct irq_pin_list *entry)
587 readl(&io_apic->data); 495 readl(&io_apic->data);
588} 496}
589 497
590static void __mask_IO_APIC_irq(struct irq_cfg *cfg) 498static void mask_ioapic(struct irq_cfg *cfg)
591{ 499{
500 unsigned long flags;
501
502 raw_spin_lock_irqsave(&ioapic_lock, flags);
592 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); 503 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
504 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
593} 505}
594 506
595static void mask_IO_APIC_irq_desc(struct irq_desc *desc) 507static void mask_ioapic_irq(struct irq_data *data)
596{ 508{
597 struct irq_cfg *cfg = desc->chip_data; 509 mask_ioapic(data->chip_data);
598 unsigned long flags; 510}
599
600 BUG_ON(!cfg);
601 511
602 raw_spin_lock_irqsave(&ioapic_lock, flags); 512static void __unmask_ioapic(struct irq_cfg *cfg)
603 __mask_IO_APIC_irq(cfg); 513{
604 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 514 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
605} 515}
606 516
607static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) 517static void unmask_ioapic(struct irq_cfg *cfg)
608{ 518{
609 struct irq_cfg *cfg = desc->chip_data;
610 unsigned long flags; 519 unsigned long flags;
611 520
612 raw_spin_lock_irqsave(&ioapic_lock, flags); 521 raw_spin_lock_irqsave(&ioapic_lock, flags);
613 __unmask_IO_APIC_irq(cfg); 522 __unmask_ioapic(cfg);
614 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 523 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
615} 524}
616 525
617static void mask_IO_APIC_irq(unsigned int irq) 526static void unmask_ioapic_irq(struct irq_data *data)
618{ 527{
619 struct irq_desc *desc = irq_to_desc(irq); 528 unmask_ioapic(data->chip_data);
620
621 mask_IO_APIC_irq_desc(desc);
622}
623static void unmask_IO_APIC_irq(unsigned int irq)
624{
625 struct irq_desc *desc = irq_to_desc(irq);
626
627 unmask_IO_APIC_irq_desc(desc);
628} 529}
629 530
630static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) 531static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
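
Editor's note: the conversion above changes the chip callbacks from taking a bare IRQ number (and re-looking-up the descriptor) to taking a struct irq_data whose chip_data already points at the per-IRQ irq_cfg. The sketch below contrasts the two callback shapes with stand-in structs; it models the idea only and is not the genirq implementation.

/* Hedged sketch: irq-number callbacks vs. irq_data callbacks. */
#include <stdio.h>

struct irq_cfg  { unsigned int vector; };
struct irq_data { unsigned int irq; void *chip_data; };

static struct irq_cfg  cfgs[4] = { { 0x30 }, { 0x31 }, { 0x32 }, { 0x33 } };
static struct irq_data irq_table[4];

/* Old style: every callback had to look the per-IRQ data up again. */
static void mask_by_number(unsigned int irq)
{
    struct irq_cfg *cfg = irq_table[irq].chip_data;    /* extra lookup */
    printf("mask irq %u (vector 0x%x)\n", irq, cfg->vector);
}

/* New style: the core hands the callback its irq_data directly. */
static void mask_by_data(struct irq_data *data)
{
    struct irq_cfg *cfg = data->chip_data;             /* no lookup */
    printf("mask irq %u (vector 0x%x)\n", data->irq, cfg->vector);
}

int main(void)
{
    for (unsigned int i = 0; i < 4; i++) {
        irq_table[i].irq = i;
        irq_table[i].chip_data = &cfgs[i];
    }
    mask_by_number(2);
    mask_by_data(&irq_table[2]);
    return 0;
}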
@@ -694,14 +595,14 @@ struct IO_APIC_route_entry **alloc_ioapic_entries(void)
694 struct IO_APIC_route_entry **ioapic_entries; 595 struct IO_APIC_route_entry **ioapic_entries;
695 596
696 ioapic_entries = kzalloc(sizeof(*ioapic_entries) * nr_ioapics, 597 ioapic_entries = kzalloc(sizeof(*ioapic_entries) * nr_ioapics,
697 GFP_ATOMIC); 598 GFP_KERNEL);
698 if (!ioapic_entries) 599 if (!ioapic_entries)
699 return 0; 600 return 0;
700 601
701 for (apic = 0; apic < nr_ioapics; apic++) { 602 for (apic = 0; apic < nr_ioapics; apic++) {
702 ioapic_entries[apic] = 603 ioapic_entries[apic] =
703 kzalloc(sizeof(struct IO_APIC_route_entry) * 604 kzalloc(sizeof(struct IO_APIC_route_entry) *
704 nr_ioapic_registers[apic], GFP_ATOMIC); 605 nr_ioapic_registers[apic], GFP_KERNEL);
705 if (!ioapic_entries[apic]) 606 if (!ioapic_entries[apic])
706 goto nomem; 607 goto nomem;
707 } 608 }
@@ -1259,7 +1160,6 @@ void __setup_vector_irq(int cpu)
1259 /* Initialize vector_irq on a new cpu */ 1160 /* Initialize vector_irq on a new cpu */
1260 int irq, vector; 1161 int irq, vector;
1261 struct irq_cfg *cfg; 1162 struct irq_cfg *cfg;
1262 struct irq_desc *desc;
1263 1163
1264 /* 1164 /*
1265 * vector_lock will make sure that we don't run into irq vector 1165 * vector_lock will make sure that we don't run into irq vector
@@ -1268,9 +1168,10 @@ void __setup_vector_irq(int cpu)
1268 */ 1168 */
1269 raw_spin_lock(&vector_lock); 1169 raw_spin_lock(&vector_lock);
1270 /* Mark the inuse vectors */ 1170 /* Mark the inuse vectors */
1271 for_each_irq_desc(irq, desc) { 1171 for_each_active_irq(irq) {
1272 cfg = desc->chip_data; 1172 cfg = get_irq_chip_data(irq);
1273 1173 if (!cfg)
1174 continue;
1274 /* 1175 /*
1275 * If it is a legacy IRQ handled by the legacy PIC, this cpu 1176 * If it is a legacy IRQ handled by the legacy PIC, this cpu
1276 * will be part of the irq_cfg's domain. 1177 * will be part of the irq_cfg's domain.
@@ -1327,17 +1228,17 @@ static inline int IO_APIC_irq_trigger(int irq)
1327} 1228}
1328#endif 1229#endif
1329 1230
1330static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger) 1231static void ioapic_register_intr(unsigned int irq, unsigned long trigger)
1331{ 1232{
1332 1233
1333 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || 1234 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1334 trigger == IOAPIC_LEVEL) 1235 trigger == IOAPIC_LEVEL)
1335 desc->status |= IRQ_LEVEL; 1236 irq_set_status_flags(irq, IRQ_LEVEL);
1336 else 1237 else
1337 desc->status &= ~IRQ_LEVEL; 1238 irq_clear_status_flags(irq, IRQ_LEVEL);
1338 1239
1339 if (irq_remapped(irq)) { 1240 if (irq_remapped(get_irq_chip_data(irq))) {
1340 desc->status |= IRQ_MOVE_PCNTXT; 1241 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
1341 if (trigger) 1242 if (trigger)
1342 set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, 1243 set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
1343 handle_fasteoi_irq, 1244 handle_fasteoi_irq,
@@ -1358,10 +1259,10 @@ static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long t
1358 handle_edge_irq, "edge"); 1259 handle_edge_irq, "edge");
1359} 1260}
1360 1261
1361int setup_ioapic_entry(int apic_id, int irq, 1262static int setup_ioapic_entry(int apic_id, int irq,
1362 struct IO_APIC_route_entry *entry, 1263 struct IO_APIC_route_entry *entry,
1363 unsigned int destination, int trigger, 1264 unsigned int destination, int trigger,
1364 int polarity, int vector, int pin) 1265 int polarity, int vector, int pin)
1365{ 1266{
1366 /* 1267 /*
1367 * add it to the IO-APIC irq-routing table: 1268 * add it to the IO-APIC irq-routing table:
@@ -1382,21 +1283,7 @@ int setup_ioapic_entry(int apic_id, int irq,
1382 if (index < 0) 1283 if (index < 0)
1383 panic("Failed to allocate IRTE for ioapic %d\n", apic_id); 1284 panic("Failed to allocate IRTE for ioapic %d\n", apic_id);
1384 1285
1385 memset(&irte, 0, sizeof(irte)); 1286 prepare_irte(&irte, vector, destination);
1386
1387 irte.present = 1;
1388 irte.dst_mode = apic->irq_dest_mode;
1389 /*
1390 * Trigger mode in the IRTE will always be edge, and the
1391 * actual level or edge trigger will be setup in the IO-APIC
1392 * RTE. This will help simplify level triggered irq migration.
1393 * For more details, see the comments above explainig IO-APIC
1394 * irq migration in the presence of interrupt-remapping.
1395 */
1396 irte.trigger_mode = 0;
1397 irte.dlvry_mode = apic->irq_delivery_mode;
1398 irte.vector = vector;
1399 irte.dest_id = IRTE_DEST(destination);
1400 1287
1401 /* Set source-id of interrupt request */ 1288 /* Set source-id of interrupt request */
1402 set_ioapic_sid(&irte, apic_id); 1289 set_ioapic_sid(&irte, apic_id);
@@ -1431,18 +1318,14 @@ int setup_ioapic_entry(int apic_id, int irq,
1431 return 0; 1318 return 0;
1432} 1319}
1433 1320
1434static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq_desc *desc, 1321static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
1435 int trigger, int polarity) 1322 struct irq_cfg *cfg, int trigger, int polarity)
1436{ 1323{
1437 struct irq_cfg *cfg;
1438 struct IO_APIC_route_entry entry; 1324 struct IO_APIC_route_entry entry;
1439 unsigned int dest; 1325 unsigned int dest;
1440 1326
1441 if (!IO_APIC_IRQ(irq)) 1327 if (!IO_APIC_IRQ(irq))
1442 return; 1328 return;
1443
1444 cfg = desc->chip_data;
1445
1446 /* 1329 /*
1447 * For legacy irqs, cfg->domain starts with cpu 0 for legacy 1330 * For legacy irqs, cfg->domain starts with cpu 0 for legacy
1448 * controllers like 8259. Now that IO-APIC can handle this irq, update 1331 * controllers like 8259. Now that IO-APIC can handle this irq, update
@@ -1471,9 +1354,9 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
1471 return; 1354 return;
1472 } 1355 }
1473 1356
1474 ioapic_register_intr(irq, desc, trigger); 1357 ioapic_register_intr(irq, trigger);
1475 if (irq < legacy_pic->nr_legacy_irqs) 1358 if (irq < legacy_pic->nr_legacy_irqs)
1476 legacy_pic->chip->mask(irq); 1359 legacy_pic->mask(irq);
1477 1360
1478 ioapic_write_entry(apic_id, pin, entry); 1361 ioapic_write_entry(apic_id, pin, entry);
1479} 1362}
@@ -1484,11 +1367,9 @@ static struct {
1484 1367
1485static void __init setup_IO_APIC_irqs(void) 1368static void __init setup_IO_APIC_irqs(void)
1486{ 1369{
1487 int apic_id, pin, idx, irq; 1370 int apic_id, pin, idx, irq, notcon = 0;
1488 int notcon = 0; 1371 int node = cpu_to_node(0);
1489 struct irq_desc *desc;
1490 struct irq_cfg *cfg; 1372 struct irq_cfg *cfg;
1491 int node = cpu_to_node(boot_cpu_id);
1492 1373
1493 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1374 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1494 1375
@@ -1525,19 +1406,17 @@ static void __init setup_IO_APIC_irqs(void)
1525 apic->multi_timer_check(apic_id, irq)) 1406 apic->multi_timer_check(apic_id, irq))
1526 continue; 1407 continue;
1527 1408
1528 desc = irq_to_desc_alloc_node(irq, node); 1409 cfg = alloc_irq_and_cfg_at(irq, node);
1529 if (!desc) { 1410 if (!cfg)
1530 printk(KERN_INFO "can not get irq_desc for %d\n", irq);
1531 continue; 1411 continue;
1532 } 1412
1533 cfg = desc->chip_data;
1534 add_pin_to_irq_node(cfg, node, apic_id, pin); 1413 add_pin_to_irq_node(cfg, node, apic_id, pin);
1535 /* 1414 /*
1536 * don't mark it in pin_programmed, so later acpi could 1415 * don't mark it in pin_programmed, so later acpi could
1537 * set it correctly when irq < 16 1416 * set it correctly when irq < 16
1538 */ 1417 */
1539 setup_IO_APIC_irq(apic_id, pin, irq, desc, 1418 setup_ioapic_irq(apic_id, pin, irq, cfg, irq_trigger(idx),
1540 irq_trigger(idx), irq_polarity(idx)); 1419 irq_polarity(idx));
1541 } 1420 }
1542 1421
1543 if (notcon) 1422 if (notcon)
@@ -1552,9 +1431,7 @@ static void __init setup_IO_APIC_irqs(void)
1552 */ 1431 */
1553void setup_IO_APIC_irq_extra(u32 gsi) 1432void setup_IO_APIC_irq_extra(u32 gsi)
1554{ 1433{
1555 int apic_id = 0, pin, idx, irq; 1434 int apic_id = 0, pin, idx, irq, node = cpu_to_node(0);
1556 int node = cpu_to_node(boot_cpu_id);
1557 struct irq_desc *desc;
1558 struct irq_cfg *cfg; 1435 struct irq_cfg *cfg;
1559 1436
1560 /* 1437 /*
@@ -1570,18 +1447,15 @@ void setup_IO_APIC_irq_extra(u32 gsi)
1570 return; 1447 return;
1571 1448
1572 irq = pin_2_irq(idx, apic_id, pin); 1449 irq = pin_2_irq(idx, apic_id, pin);
1573#ifdef CONFIG_SPARSE_IRQ 1450
1574 desc = irq_to_desc(irq); 1451 /* Only handle the non legacy irqs on secondary ioapics */
1575 if (desc) 1452 if (apic_id == 0 || irq < NR_IRQS_LEGACY)
1576 return; 1453 return;
1577#endif 1454
1578 desc = irq_to_desc_alloc_node(irq, node); 1455 cfg = alloc_irq_and_cfg_at(irq, node);
1579 if (!desc) { 1456 if (!cfg)
1580 printk(KERN_INFO "can not get irq_desc for %d\n", irq);
1581 return; 1457 return;
1582 }
1583 1458
1584 cfg = desc->chip_data;
1585 add_pin_to_irq_node(cfg, node, apic_id, pin); 1459 add_pin_to_irq_node(cfg, node, apic_id, pin);
1586 1460
1587 if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) { 1461 if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) {
@@ -1591,7 +1465,7 @@ void setup_IO_APIC_irq_extra(u32 gsi)
1591 } 1465 }
1592 set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed); 1466 set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed);
1593 1467
1594 setup_IO_APIC_irq(apic_id, pin, irq, desc, 1468 setup_ioapic_irq(apic_id, pin, irq, cfg,
1595 irq_trigger(idx), irq_polarity(idx)); 1469 irq_trigger(idx), irq_polarity(idx));
1596} 1470}
1597 1471
@@ -1642,7 +1516,6 @@ __apicdebuginit(void) print_IO_APIC(void)
1642 union IO_APIC_reg_03 reg_03; 1516 union IO_APIC_reg_03 reg_03;
1643 unsigned long flags; 1517 unsigned long flags;
1644 struct irq_cfg *cfg; 1518 struct irq_cfg *cfg;
1645 struct irq_desc *desc;
1646 unsigned int irq; 1519 unsigned int irq;
1647 1520
1648 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); 1521 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
@@ -1729,10 +1602,10 @@ __apicdebuginit(void) print_IO_APIC(void)
1729 } 1602 }
1730 } 1603 }
1731 printk(KERN_DEBUG "IRQ to pin mappings:\n"); 1604 printk(KERN_DEBUG "IRQ to pin mappings:\n");
1732 for_each_irq_desc(irq, desc) { 1605 for_each_active_irq(irq) {
1733 struct irq_pin_list *entry; 1606 struct irq_pin_list *entry;
1734 1607
1735 cfg = desc->chip_data; 1608 cfg = get_irq_chip_data(irq);
1736 if (!cfg) 1609 if (!cfg)
1737 continue; 1610 continue;
1738 entry = cfg->irq_2_pin; 1611 entry = cfg->irq_2_pin;
@@ -2239,29 +2112,26 @@ static int __init timer_irq_works(void)
2239 * an edge even if it isn't on the 8259A... 2112 * an edge even if it isn't on the 8259A...
2240 */ 2113 */
2241 2114
2242static unsigned int startup_ioapic_irq(unsigned int irq) 2115static unsigned int startup_ioapic_irq(struct irq_data *data)
2243{ 2116{
2244 int was_pending = 0; 2117 int was_pending = 0, irq = data->irq;
2245 unsigned long flags; 2118 unsigned long flags;
2246 struct irq_cfg *cfg;
2247 2119
2248 raw_spin_lock_irqsave(&ioapic_lock, flags); 2120 raw_spin_lock_irqsave(&ioapic_lock, flags);
2249 if (irq < legacy_pic->nr_legacy_irqs) { 2121 if (irq < legacy_pic->nr_legacy_irqs) {
2250 legacy_pic->chip->mask(irq); 2122 legacy_pic->mask(irq);
2251 if (legacy_pic->irq_pending(irq)) 2123 if (legacy_pic->irq_pending(irq))
2252 was_pending = 1; 2124 was_pending = 1;
2253 } 2125 }
2254 cfg = irq_cfg(irq); 2126 __unmask_ioapic(data->chip_data);
2255 __unmask_IO_APIC_irq(cfg);
2256 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2127 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2257 2128
2258 return was_pending; 2129 return was_pending;
2259} 2130}
2260 2131
2261static int ioapic_retrigger_irq(unsigned int irq) 2132static int ioapic_retrigger_irq(struct irq_data *data)
2262{ 2133{
2263 2134 struct irq_cfg *cfg = data->chip_data;
2264 struct irq_cfg *cfg = irq_cfg(irq);
2265 unsigned long flags; 2135 unsigned long flags;
2266 2136
2267 raw_spin_lock_irqsave(&vector_lock, flags); 2137 raw_spin_lock_irqsave(&vector_lock, flags);
@@ -2312,7 +2182,7 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
2312 * With interrupt-remapping, destination information comes 2182 * With interrupt-remapping, destination information comes
2313 * from interrupt-remapping table entry. 2183 * from interrupt-remapping table entry.
2314 */ 2184 */
2315 if (!irq_remapped(irq)) 2185 if (!irq_remapped(cfg))
2316 io_apic_write(apic, 0x11 + pin*2, dest); 2186 io_apic_write(apic, 0x11 + pin*2, dest);
2317 reg = io_apic_read(apic, 0x10 + pin*2); 2187 reg = io_apic_read(apic, 0x10 + pin*2);
2318 reg &= ~IO_APIC_REDIR_VECTOR_MASK; 2188 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
@@ -2322,65 +2192,46 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
2322} 2192}
2323 2193
2324/* 2194/*
2325 * Either sets desc->affinity to a valid value, and returns 2195 * Either sets data->affinity to a valid value, and returns
2326 * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and 2196 * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
2327 * leaves desc->affinity untouched. 2197 * leaves data->affinity untouched.
2328 */ 2198 */
2329unsigned int 2199int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2330set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask, 2200 unsigned int *dest_id)
2331 unsigned int *dest_id)
2332{ 2201{
2333 struct irq_cfg *cfg; 2202 struct irq_cfg *cfg = data->chip_data;
2334 unsigned int irq;
2335 2203
2336 if (!cpumask_intersects(mask, cpu_online_mask)) 2204 if (!cpumask_intersects(mask, cpu_online_mask))
2337 return -1; 2205 return -1;
2338 2206
2339 irq = desc->irq; 2207 if (assign_irq_vector(data->irq, data->chip_data, mask))
2340 cfg = desc->chip_data;
2341 if (assign_irq_vector(irq, cfg, mask))
2342 return -1; 2208 return -1;
2343 2209
2344 cpumask_copy(desc->affinity, mask); 2210 cpumask_copy(data->affinity, mask);
2345 2211
2346 *dest_id = apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); 2212 *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain);
2347 return 0; 2213 return 0;
2348} 2214}
2349 2215
2350static int 2216static int
2351set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) 2217ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2218 bool force)
2352{ 2219{
2353 struct irq_cfg *cfg; 2220 unsigned int dest, irq = data->irq;
2354 unsigned long flags; 2221 unsigned long flags;
2355 unsigned int dest; 2222 int ret;
2356 unsigned int irq;
2357 int ret = -1;
2358
2359 irq = desc->irq;
2360 cfg = desc->chip_data;
2361 2223
2362 raw_spin_lock_irqsave(&ioapic_lock, flags); 2224 raw_spin_lock_irqsave(&ioapic_lock, flags);
2363 ret = set_desc_affinity(desc, mask, &dest); 2225 ret = __ioapic_set_affinity(data, mask, &dest);
2364 if (!ret) { 2226 if (!ret) {
2365 /* Only the high 8 bits are valid. */ 2227 /* Only the high 8 bits are valid. */
2366 dest = SET_APIC_LOGICAL_ID(dest); 2228 dest = SET_APIC_LOGICAL_ID(dest);
2367 __target_IO_APIC_irq(irq, dest, cfg); 2229 __target_IO_APIC_irq(irq, dest, data->chip_data);
2368 } 2230 }
2369 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2231 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2370
2371 return ret; 2232 return ret;
2372} 2233}
2373 2234
2374static int
2375set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
2376{
2377 struct irq_desc *desc;
2378
2379 desc = irq_to_desc(irq);
2380
2381 return set_ioapic_affinity_irq_desc(desc, mask);
2382}
2383
2384#ifdef CONFIG_INTR_REMAP 2235#ifdef CONFIG_INTR_REMAP
2385 2236
2386/* 2237/*
@@ -2395,24 +2246,21 @@ set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
2395 * the interrupt-remapping table entry. 2246 * the interrupt-remapping table entry.
2396 */ 2247 */
2397static int 2248static int
2398migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) 2249ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2250 bool force)
2399{ 2251{
2400 struct irq_cfg *cfg; 2252 struct irq_cfg *cfg = data->chip_data;
2253 unsigned int dest, irq = data->irq;
2401 struct irte irte; 2254 struct irte irte;
2402 unsigned int dest;
2403 unsigned int irq;
2404 int ret = -1;
2405 2255
2406 if (!cpumask_intersects(mask, cpu_online_mask)) 2256 if (!cpumask_intersects(mask, cpu_online_mask))
2407 return ret; 2257 return -EINVAL;
2408 2258
2409 irq = desc->irq;
2410 if (get_irte(irq, &irte)) 2259 if (get_irte(irq, &irte))
2411 return ret; 2260 return -EBUSY;
2412 2261
2413 cfg = desc->chip_data;
2414 if (assign_irq_vector(irq, cfg, mask)) 2262 if (assign_irq_vector(irq, cfg, mask))
2415 return ret; 2263 return -EBUSY;
2416 2264
2417 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); 2265 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
2418 2266
@@ -2427,29 +2275,14 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2427 if (cfg->move_in_progress) 2275 if (cfg->move_in_progress)
2428 send_cleanup_vector(cfg); 2276 send_cleanup_vector(cfg);
2429 2277
2430 cpumask_copy(desc->affinity, mask); 2278 cpumask_copy(data->affinity, mask);
2431
2432 return 0; 2279 return 0;
2433} 2280}
2434 2281
2435/*
2436 * Migrates the IRQ destination in the process context.
2437 */
2438static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2439 const struct cpumask *mask)
2440{
2441 return migrate_ioapic_irq_desc(desc, mask);
2442}
2443static int set_ir_ioapic_affinity_irq(unsigned int irq,
2444 const struct cpumask *mask)
2445{
2446 struct irq_desc *desc = irq_to_desc(irq);
2447
2448 return set_ir_ioapic_affinity_irq_desc(desc, mask);
2449}
2450#else 2282#else
2451static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, 2283static inline int
2452 const struct cpumask *mask) 2284ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2285 bool force)
2453{ 2286{
2454 return 0; 2287 return 0;
2455} 2288}
@@ -2511,10 +2344,8 @@ unlock:
2511 irq_exit(); 2344 irq_exit();
2512} 2345}
2513 2346
2514static void __irq_complete_move(struct irq_desc **descp, unsigned vector) 2347static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
2515{ 2348{
2516 struct irq_desc *desc = *descp;
2517 struct irq_cfg *cfg = desc->chip_data;
2518 unsigned me; 2349 unsigned me;
2519 2350
2520 if (likely(!cfg->move_in_progress)) 2351 if (likely(!cfg->move_in_progress))
@@ -2526,31 +2357,28 @@ static void __irq_complete_move(struct irq_desc **descp, unsigned vector)
2526 send_cleanup_vector(cfg); 2357 send_cleanup_vector(cfg);
2527} 2358}
2528 2359
2529static void irq_complete_move(struct irq_desc **descp) 2360static void irq_complete_move(struct irq_cfg *cfg)
2530{ 2361{
2531 __irq_complete_move(descp, ~get_irq_regs()->orig_ax); 2362 __irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
2532} 2363}
2533 2364
2534void irq_force_complete_move(int irq) 2365void irq_force_complete_move(int irq)
2535{ 2366{
2536 struct irq_desc *desc = irq_to_desc(irq); 2367 struct irq_cfg *cfg = get_irq_chip_data(irq);
2537 struct irq_cfg *cfg = desc->chip_data;
2538 2368
2539 if (!cfg) 2369 if (!cfg)
2540 return; 2370 return;
2541 2371
2542 __irq_complete_move(&desc, cfg->vector); 2372 __irq_complete_move(cfg, cfg->vector);
2543} 2373}
2544#else 2374#else
2545static inline void irq_complete_move(struct irq_desc **descp) {} 2375static inline void irq_complete_move(struct irq_cfg *cfg) { }
2546#endif 2376#endif
2547 2377
2548static void ack_apic_edge(unsigned int irq) 2378static void ack_apic_edge(struct irq_data *data)
2549{ 2379{
2550 struct irq_desc *desc = irq_to_desc(irq); 2380 irq_complete_move(data->chip_data);
2551 2381 move_native_irq(data->irq);
2552 irq_complete_move(&desc);
2553 move_native_irq(irq);
2554 ack_APIC_irq(); 2382 ack_APIC_irq();
2555} 2383}
2556 2384
@@ -2572,10 +2400,12 @@ atomic_t irq_mis_count;
2572 * Otherwise, we simulate the EOI message manually by changing the trigger 2400 * Otherwise, we simulate the EOI message manually by changing the trigger
2573 * mode to edge and then back to level, with RTE being masked during this. 2401 * mode to edge and then back to level, with RTE being masked during this.
2574*/ 2402*/
2575static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) 2403static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2576{ 2404{
2577 struct irq_pin_list *entry; 2405 struct irq_pin_list *entry;
2406 unsigned long flags;
2578 2407
2408 raw_spin_lock_irqsave(&ioapic_lock, flags);
2579 for_each_irq_pin(entry, cfg->irq_2_pin) { 2409 for_each_irq_pin(entry, cfg->irq_2_pin) {
2580 if (mp_ioapics[entry->apic].apicver >= 0x20) { 2410 if (mp_ioapics[entry->apic].apicver >= 0x20) {
2581 /* 2411 /*
@@ -2584,7 +2414,7 @@ static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2584 * intr-remapping table entry. Hence for the io-apic 2414 * intr-remapping table entry. Hence for the io-apic
2585 * EOI we use the pin number. 2415 * EOI we use the pin number.
2586 */ 2416 */
2587 if (irq_remapped(irq)) 2417 if (irq_remapped(cfg))
2588 io_apic_eoi(entry->apic, entry->pin); 2418 io_apic_eoi(entry->apic, entry->pin);
2589 else 2419 else
2590 io_apic_eoi(entry->apic, cfg->vector); 2420 io_apic_eoi(entry->apic, cfg->vector);
@@ -2593,36 +2423,22 @@ static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2593 __unmask_and_level_IO_APIC_irq(entry); 2423 __unmask_and_level_IO_APIC_irq(entry);
2594 } 2424 }
2595 } 2425 }
2596}
2597
2598static void eoi_ioapic_irq(struct irq_desc *desc)
2599{
2600 struct irq_cfg *cfg;
2601 unsigned long flags;
2602 unsigned int irq;
2603
2604 irq = desc->irq;
2605 cfg = desc->chip_data;
2606
2607 raw_spin_lock_irqsave(&ioapic_lock, flags);
2608 __eoi_ioapic_irq(irq, cfg);
2609 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2426 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2610} 2427}
2611 2428
2612static void ack_apic_level(unsigned int irq) 2429static void ack_apic_level(struct irq_data *data)
2613{ 2430{
2431 struct irq_cfg *cfg = data->chip_data;
2432 int i, do_unmask_irq = 0, irq = data->irq;
2614 struct irq_desc *desc = irq_to_desc(irq); 2433 struct irq_desc *desc = irq_to_desc(irq);
2615 unsigned long v; 2434 unsigned long v;
2616 int i;
2617 struct irq_cfg *cfg;
2618 int do_unmask_irq = 0;
2619 2435
2620 irq_complete_move(&desc); 2436 irq_complete_move(cfg);
2621#ifdef CONFIG_GENERIC_PENDING_IRQ 2437#ifdef CONFIG_GENERIC_PENDING_IRQ
2622 /* If we are moving the irq we need to mask it */ 2438 /* If we are moving the irq we need to mask it */
2623 if (unlikely(desc->status & IRQ_MOVE_PENDING)) { 2439 if (unlikely(desc->status & IRQ_MOVE_PENDING)) {
2624 do_unmask_irq = 1; 2440 do_unmask_irq = 1;
2625 mask_IO_APIC_irq_desc(desc); 2441 mask_ioapic(cfg);
2626 } 2442 }
2627#endif 2443#endif
2628 2444
@@ -2658,7 +2474,6 @@ static void ack_apic_level(unsigned int irq)
2658 * we use the above logic (mask+edge followed by unmask+level) from 2474 * we use the above logic (mask+edge followed by unmask+level) from
2659 * Manfred Spraul to clear the remote IRR. 2475 * Manfred Spraul to clear the remote IRR.
2660 */ 2476 */
2661 cfg = desc->chip_data;
2662 i = cfg->vector; 2477 i = cfg->vector;
2663 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); 2478 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
2664 2479
@@ -2678,7 +2493,7 @@ static void ack_apic_level(unsigned int irq)
2678 if (!(v & (1 << (i & 0x1f)))) { 2493 if (!(v & (1 << (i & 0x1f)))) {
2679 atomic_inc(&irq_mis_count); 2494 atomic_inc(&irq_mis_count);
2680 2495
2681 eoi_ioapic_irq(desc); 2496 eoi_ioapic_irq(irq, cfg);
2682 } 2497 }
2683 2498
2684 /* Now we can move and re-enable the irq */ 2499
@@ -2709,61 +2524,57 @@ static void ack_apic_level(unsigned int irq)
2709 * accurate and is causing problems then it is a hardware bug 2524 * accurate and is causing problems then it is a hardware bug
2710 * and you can go talk to the chipset vendor about it. 2525 * and you can go talk to the chipset vendor about it.
2711 */ 2526 */
2712 cfg = desc->chip_data;
2713 if (!io_apic_level_ack_pending(cfg)) 2527 if (!io_apic_level_ack_pending(cfg))
2714 move_masked_irq(irq); 2528 move_masked_irq(irq);
2715 unmask_IO_APIC_irq_desc(desc); 2529 unmask_ioapic(cfg);
2716 } 2530 }
2717} 2531}
2718 2532
2719#ifdef CONFIG_INTR_REMAP 2533#ifdef CONFIG_INTR_REMAP
2720static void ir_ack_apic_edge(unsigned int irq) 2534static void ir_ack_apic_edge(struct irq_data *data)
2721{ 2535{
2722 ack_APIC_irq(); 2536 ack_APIC_irq();
2723} 2537}
2724 2538
2725static void ir_ack_apic_level(unsigned int irq) 2539static void ir_ack_apic_level(struct irq_data *data)
2726{ 2540{
2727 struct irq_desc *desc = irq_to_desc(irq);
2728
2729 ack_APIC_irq(); 2541 ack_APIC_irq();
2730 eoi_ioapic_irq(desc); 2542 eoi_ioapic_irq(data->irq, data->chip_data);
2731} 2543}
2732#endif /* CONFIG_INTR_REMAP */ 2544#endif /* CONFIG_INTR_REMAP */
2733 2545
2734static struct irq_chip ioapic_chip __read_mostly = { 2546static struct irq_chip ioapic_chip __read_mostly = {
2735 .name = "IO-APIC", 2547 .name = "IO-APIC",
2736 .startup = startup_ioapic_irq, 2548 .irq_startup = startup_ioapic_irq,
2737 .mask = mask_IO_APIC_irq, 2549 .irq_mask = mask_ioapic_irq,
2738 .unmask = unmask_IO_APIC_irq, 2550 .irq_unmask = unmask_ioapic_irq,
2739 .ack = ack_apic_edge, 2551 .irq_ack = ack_apic_edge,
2740 .eoi = ack_apic_level, 2552 .irq_eoi = ack_apic_level,
2741#ifdef CONFIG_SMP 2553#ifdef CONFIG_SMP
2742 .set_affinity = set_ioapic_affinity_irq, 2554 .irq_set_affinity = ioapic_set_affinity,
2743#endif 2555#endif
2744 .retrigger = ioapic_retrigger_irq, 2556 .irq_retrigger = ioapic_retrigger_irq,
2745}; 2557};
2746 2558
2747static struct irq_chip ir_ioapic_chip __read_mostly = { 2559static struct irq_chip ir_ioapic_chip __read_mostly = {
2748 .name = "IR-IO-APIC", 2560 .name = "IR-IO-APIC",
2749 .startup = startup_ioapic_irq, 2561 .irq_startup = startup_ioapic_irq,
2750 .mask = mask_IO_APIC_irq, 2562 .irq_mask = mask_ioapic_irq,
2751 .unmask = unmask_IO_APIC_irq, 2563 .irq_unmask = unmask_ioapic_irq,
2752#ifdef CONFIG_INTR_REMAP 2564#ifdef CONFIG_INTR_REMAP
2753 .ack = ir_ack_apic_edge, 2565 .irq_ack = ir_ack_apic_edge,
2754 .eoi = ir_ack_apic_level, 2566 .irq_eoi = ir_ack_apic_level,
2755#ifdef CONFIG_SMP 2567#ifdef CONFIG_SMP
2756 .set_affinity = set_ir_ioapic_affinity_irq, 2568 .irq_set_affinity = ir_ioapic_set_affinity,
2757#endif 2569#endif
2758#endif 2570#endif
2759 .retrigger = ioapic_retrigger_irq, 2571 .irq_retrigger = ioapic_retrigger_irq,
2760}; 2572};
2761 2573
2762static inline void init_IO_APIC_traps(void) 2574static inline void init_IO_APIC_traps(void)
2763{ 2575{
2764 int irq;
2765 struct irq_desc *desc;
2766 struct irq_cfg *cfg; 2576 struct irq_cfg *cfg;
2577 unsigned int irq;
2767 2578
2768 /* 2579 /*
2769 * NOTE! The local APIC isn't very good at handling 2580 * NOTE! The local APIC isn't very good at handling
@@ -2776,8 +2587,8 @@ static inline void init_IO_APIC_traps(void)
2776 * Also, we've got to be careful not to trash gate 2587 * Also, we've got to be careful not to trash gate
2777 * 0x80, because int 0x80 is hm, kind of importantish. ;) 2588 * 0x80, because int 0x80 is hm, kind of importantish. ;)
2778 */ 2589 */
2779 for_each_irq_desc(irq, desc) { 2590 for_each_active_irq(irq) {
2780 cfg = desc->chip_data; 2591 cfg = get_irq_chip_data(irq);
2781 if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { 2592 if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
2782 /* 2593 /*
2783 * Hmm.. We don't have an entry for this, 2594 * Hmm.. We don't have an entry for this,
@@ -2788,7 +2599,7 @@ static inline void init_IO_APIC_traps(void)
2788 legacy_pic->make_irq(irq); 2599 legacy_pic->make_irq(irq);
2789 else 2600 else
2790 /* Strange. Oh, well.. */ 2601 /* Strange. Oh, well.. */
2791 desc->chip = &no_irq_chip; 2602 set_irq_chip(irq, &no_irq_chip);
2792 } 2603 }
2793 } 2604 }
2794} 2605}
@@ -2797,7 +2608,7 @@ static inline void init_IO_APIC_traps(void)
2797 * The local APIC irq-chip implementation: 2608 * The local APIC irq-chip implementation:
2798 */ 2609 */
2799 2610
2800static void mask_lapic_irq(unsigned int irq) 2611static void mask_lapic_irq(struct irq_data *data)
2801{ 2612{
2802 unsigned long v; 2613 unsigned long v;
2803 2614
@@ -2805,7 +2616,7 @@ static void mask_lapic_irq(unsigned int irq)
2805 apic_write(APIC_LVT0, v | APIC_LVT_MASKED); 2616 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
2806} 2617}
2807 2618
2808static void unmask_lapic_irq(unsigned int irq) 2619static void unmask_lapic_irq(struct irq_data *data)
2809{ 2620{
2810 unsigned long v; 2621 unsigned long v;
2811 2622
@@ -2813,21 +2624,21 @@ static void unmask_lapic_irq(unsigned int irq)
2813 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); 2624 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
2814} 2625}
2815 2626
2816static void ack_lapic_irq(unsigned int irq) 2627static void ack_lapic_irq(struct irq_data *data)
2817{ 2628{
2818 ack_APIC_irq(); 2629 ack_APIC_irq();
2819} 2630}
2820 2631
2821static struct irq_chip lapic_chip __read_mostly = { 2632static struct irq_chip lapic_chip __read_mostly = {
2822 .name = "local-APIC", 2633 .name = "local-APIC",
2823 .mask = mask_lapic_irq, 2634 .irq_mask = mask_lapic_irq,
2824 .unmask = unmask_lapic_irq, 2635 .irq_unmask = unmask_lapic_irq,
2825 .ack = ack_lapic_irq, 2636 .irq_ack = ack_lapic_irq,
2826}; 2637};
2827 2638
2828static void lapic_register_intr(int irq, struct irq_desc *desc) 2639static void lapic_register_intr(int irq)
2829{ 2640{
2830 desc->status &= ~IRQ_LEVEL; 2641 irq_clear_status_flags(irq, IRQ_LEVEL);
2831 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, 2642 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
2832 "edge"); 2643 "edge");
2833} 2644}
@@ -2930,9 +2741,8 @@ int timer_through_8259 __initdata;
2930 */ 2741 */
2931static inline void __init check_timer(void) 2742static inline void __init check_timer(void)
2932{ 2743{
2933 struct irq_desc *desc = irq_to_desc(0); 2744 struct irq_cfg *cfg = get_irq_chip_data(0);
2934 struct irq_cfg *cfg = desc->chip_data; 2745 int node = cpu_to_node(0);
2935 int node = cpu_to_node(boot_cpu_id);
2936 int apic1, pin1, apic2, pin2; 2746 int apic1, pin1, apic2, pin2;
2937 unsigned long flags; 2747 unsigned long flags;
2938 int no_pin1 = 0; 2748 int no_pin1 = 0;
@@ -2942,7 +2752,7 @@ static inline void __init check_timer(void)
2942 /* 2752 /*
2943 * get/set the timer IRQ vector: 2753 * get/set the timer IRQ vector:
2944 */ 2754 */
2945 legacy_pic->chip->mask(0); 2755 legacy_pic->mask(0);
2946 assign_irq_vector(0, cfg, apic->target_cpus()); 2756 assign_irq_vector(0, cfg, apic->target_cpus());
2947 2757
2948 /* 2758 /*
@@ -3001,7 +2811,7 @@ static inline void __init check_timer(void)
3001 add_pin_to_irq_node(cfg, node, apic1, pin1); 2811 add_pin_to_irq_node(cfg, node, apic1, pin1);
3002 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); 2812 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
3003 } else { 2813 } else {
3004 /* for edge trigger, setup_IO_APIC_irq already 2814 /* for edge trigger, setup_ioapic_irq already
3005 * leave it unmasked. 2815 * leave it unmasked.
3006 * so only need to unmask if it is level-trigger 2816 * so only need to unmask if it is level-trigger
3007 * do we really have level trigger timer? 2817 * do we really have level trigger timer?
@@ -3009,12 +2819,12 @@ static inline void __init check_timer(void)
3009 int idx; 2819 int idx;
3010 idx = find_irq_entry(apic1, pin1, mp_INT); 2820 idx = find_irq_entry(apic1, pin1, mp_INT);
3011 if (idx != -1 && irq_trigger(idx)) 2821 if (idx != -1 && irq_trigger(idx))
3012 unmask_IO_APIC_irq_desc(desc); 2822 unmask_ioapic(cfg);
3013 } 2823 }
3014 if (timer_irq_works()) { 2824 if (timer_irq_works()) {
3015 if (nmi_watchdog == NMI_IO_APIC) { 2825 if (nmi_watchdog == NMI_IO_APIC) {
3016 setup_nmi(); 2826 setup_nmi();
3017 legacy_pic->chip->unmask(0); 2827 legacy_pic->unmask(0);
3018 } 2828 }
3019 if (disable_timer_pin_1 > 0) 2829 if (disable_timer_pin_1 > 0)
3020 clear_IO_APIC_pin(0, pin1); 2830 clear_IO_APIC_pin(0, pin1);
@@ -3037,14 +2847,14 @@ static inline void __init check_timer(void)
3037 */ 2847 */
3038 replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); 2848 replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
3039 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); 2849 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
3040 legacy_pic->chip->unmask(0); 2850 legacy_pic->unmask(0);
3041 if (timer_irq_works()) { 2851 if (timer_irq_works()) {
3042 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); 2852 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
3043 timer_through_8259 = 1; 2853 timer_through_8259 = 1;
3044 if (nmi_watchdog == NMI_IO_APIC) { 2854 if (nmi_watchdog == NMI_IO_APIC) {
3045 legacy_pic->chip->mask(0); 2855 legacy_pic->mask(0);
3046 setup_nmi(); 2856 setup_nmi();
3047 legacy_pic->chip->unmask(0); 2857 legacy_pic->unmask(0);
3048 } 2858 }
3049 goto out; 2859 goto out;
3050 } 2860 }
@@ -3052,7 +2862,7 @@ static inline void __init check_timer(void)
3052 * Cleanup, just in case ... 2862 * Cleanup, just in case ...
3053 */ 2863 */
3054 local_irq_disable(); 2864 local_irq_disable();
3055 legacy_pic->chip->mask(0); 2865 legacy_pic->mask(0);
3056 clear_IO_APIC_pin(apic2, pin2); 2866 clear_IO_APIC_pin(apic2, pin2);
3057 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); 2867 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
3058 } 2868 }
@@ -3069,16 +2879,16 @@ static inline void __init check_timer(void)
3069 apic_printk(APIC_QUIET, KERN_INFO 2879 apic_printk(APIC_QUIET, KERN_INFO
3070 "...trying to set up timer as Virtual Wire IRQ...\n"); 2880 "...trying to set up timer as Virtual Wire IRQ...\n");
3071 2881
3072 lapic_register_intr(0, desc); 2882 lapic_register_intr(0);
3073 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ 2883 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
3074 legacy_pic->chip->unmask(0); 2884 legacy_pic->unmask(0);
3075 2885
3076 if (timer_irq_works()) { 2886 if (timer_irq_works()) {
3077 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); 2887 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
3078 goto out; 2888 goto out;
3079 } 2889 }
3080 local_irq_disable(); 2890 local_irq_disable();
3081 legacy_pic->chip->mask(0); 2891 legacy_pic->mask(0);
3082 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); 2892 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
3083 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); 2893 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
3084 2894
@@ -3244,49 +3054,42 @@ device_initcall(ioapic_init_sysfs);
3244/* 3054/*
3245 * Dynamic irq allocate and deallocation 3055 * Dynamic irq allocate and deallocation
3246 */ 3056 */
3247unsigned int create_irq_nr(unsigned int irq_want, int node) 3057unsigned int create_irq_nr(unsigned int from, int node)
3248{ 3058{
3249 /* Allocate an unused irq */ 3059 struct irq_cfg *cfg;
3250 unsigned int irq;
3251 unsigned int new;
3252 unsigned long flags; 3060 unsigned long flags;
3253 struct irq_cfg *cfg_new = NULL; 3061 unsigned int ret = 0;
3254 struct irq_desc *desc_new = NULL; 3062 int irq;
3255
3256 irq = 0;
3257 if (irq_want < nr_irqs_gsi)
3258 irq_want = nr_irqs_gsi;
3259
3260 raw_spin_lock_irqsave(&vector_lock, flags);
3261 for (new = irq_want; new < nr_irqs; new++) {
3262 desc_new = irq_to_desc_alloc_node(new, node);
3263 if (!desc_new) {
3264 printk(KERN_INFO "can not get irq_desc for %d\n", new);
3265 continue;
3266 }
3267 cfg_new = desc_new->chip_data;
3268
3269 if (cfg_new->vector != 0)
3270 continue;
3271 3063
3272 desc_new = move_irq_desc(desc_new, node); 3064 if (from < nr_irqs_gsi)
3273 cfg_new = desc_new->chip_data; 3065 from = nr_irqs_gsi;
3274 3066
3275 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) 3067 irq = alloc_irq_from(from, node);
3276 irq = new; 3068 if (irq < 0)
3277 break; 3069 return 0;
3070 cfg = alloc_irq_cfg(irq, node);
3071 if (!cfg) {
3072 free_irq_at(irq, NULL);
3073 return 0;
3278 } 3074 }
3279 raw_spin_unlock_irqrestore(&vector_lock, flags);
3280 3075
3281 if (irq > 0) 3076 raw_spin_lock_irqsave(&vector_lock, flags);
3282 dynamic_irq_init_keep_chip_data(irq); 3077 if (!__assign_irq_vector(irq, cfg, apic->target_cpus()))
3078 ret = irq;
3079 raw_spin_unlock_irqrestore(&vector_lock, flags);
3283 3080
3284 return irq; 3081 if (ret) {
3082 set_irq_chip_data(irq, cfg);
3083 irq_clear_status_flags(irq, IRQ_NOREQUEST);
3084 } else {
3085 free_irq_at(irq, cfg);
3086 }
3087 return ret;
3285} 3088}
3286 3089
3287int create_irq(void) 3090int create_irq(void)
3288{ 3091{
3289 int node = cpu_to_node(boot_cpu_id); 3092 int node = cpu_to_node(0);
3290 unsigned int irq_want; 3093 unsigned int irq_want;
3291 int irq; 3094 int irq;
3292 3095
@@ -3301,14 +3104,17 @@ int create_irq(void)
3301 3104
3302void destroy_irq(unsigned int irq) 3105void destroy_irq(unsigned int irq)
3303{ 3106{
3107 struct irq_cfg *cfg = get_irq_chip_data(irq);
3304 unsigned long flags; 3108 unsigned long flags;
3305 3109
3306 dynamic_irq_cleanup_keep_chip_data(irq); 3110 irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
3307 3111
3308 free_irte(irq); 3112 if (intr_remapping_enabled)
3113 free_irte(irq);
3309 raw_spin_lock_irqsave(&vector_lock, flags); 3114 raw_spin_lock_irqsave(&vector_lock, flags);
3310 __clear_irq_vector(irq, get_irq_chip_data(irq)); 3115 __clear_irq_vector(irq, cfg);
3311 raw_spin_unlock_irqrestore(&vector_lock, flags); 3116 raw_spin_unlock_irqrestore(&vector_lock, flags);
3117 free_irq_at(irq, cfg);
3312} 3118}
3313 3119
3314/* 3120/*
@@ -3332,7 +3138,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3332 3138
3333 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); 3139 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
3334 3140
3335 if (irq_remapped(irq)) { 3141 if (irq_remapped(get_irq_chip_data(irq))) {
3336 struct irte irte; 3142 struct irte irte;
3337 int ir_index; 3143 int ir_index;
3338 u16 sub_handle; 3144 u16 sub_handle;
@@ -3340,14 +3146,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3340 ir_index = map_irq_to_irte_handle(irq, &sub_handle); 3146 ir_index = map_irq_to_irte_handle(irq, &sub_handle);
3341 BUG_ON(ir_index == -1); 3147 BUG_ON(ir_index == -1);
3342 3148
3343 memset (&irte, 0, sizeof(irte)); 3149 prepare_irte(&irte, cfg->vector, dest);
3344
3345 irte.present = 1;
3346 irte.dst_mode = apic->irq_dest_mode;
3347 irte.trigger_mode = 0; /* edge */
3348 irte.dlvry_mode = apic->irq_delivery_mode;
3349 irte.vector = cfg->vector;
3350 irte.dest_id = IRTE_DEST(dest);
3351 3150
3352 /* Set source-id of interrupt request */ 3151 /* Set source-id of interrupt request */
3353 if (pdev) 3152 if (pdev)
@@ -3392,26 +3191,24 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3392} 3191}
3393 3192
3394#ifdef CONFIG_SMP 3193#ifdef CONFIG_SMP
3395static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) 3194static int
3195msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
3396{ 3196{
3397 struct irq_desc *desc = irq_to_desc(irq); 3197 struct irq_cfg *cfg = data->chip_data;
3398 struct irq_cfg *cfg;
3399 struct msi_msg msg; 3198 struct msi_msg msg;
3400 unsigned int dest; 3199 unsigned int dest;
3401 3200
3402 if (set_desc_affinity(desc, mask, &dest)) 3201 if (__ioapic_set_affinity(data, mask, &dest))
3403 return -1; 3202 return -1;
3404 3203
3405 cfg = desc->chip_data; 3204 __get_cached_msi_msg(data->msi_desc, &msg);
3406
3407 get_cached_msi_msg_desc(desc, &msg);
3408 3205
3409 msg.data &= ~MSI_DATA_VECTOR_MASK; 3206 msg.data &= ~MSI_DATA_VECTOR_MASK;
3410 msg.data |= MSI_DATA_VECTOR(cfg->vector); 3207 msg.data |= MSI_DATA_VECTOR(cfg->vector);
3411 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; 3208 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
3412 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3209 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3413 3210
3414 write_msi_msg_desc(desc, &msg); 3211 __write_msi_msg(data->msi_desc, &msg);
3415 3212
3416 return 0; 3213 return 0;
3417} 3214}
@@ -3421,17 +3218,17 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3421 * done in the process context using interrupt-remapping hardware. 3218 * done in the process context using interrupt-remapping hardware.
3422 */ 3219 */
3423static int 3220static int
3424ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) 3221ir_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
3222 bool force)
3425{ 3223{
3426 struct irq_desc *desc = irq_to_desc(irq); 3224 struct irq_cfg *cfg = data->chip_data;
3427 struct irq_cfg *cfg = desc->chip_data; 3225 unsigned int dest, irq = data->irq;
3428 unsigned int dest;
3429 struct irte irte; 3226 struct irte irte;
3430 3227
3431 if (get_irte(irq, &irte)) 3228 if (get_irte(irq, &irte))
3432 return -1; 3229 return -1;
3433 3230
3434 if (set_desc_affinity(desc, mask, &dest)) 3231 if (__ioapic_set_affinity(data, mask, &dest))
3435 return -1; 3232 return -1;
3436 3233
3437 irte.vector = cfg->vector; 3234 irte.vector = cfg->vector;
@@ -3461,27 +3258,27 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3461 * which implement the MSI or MSI-X Capability Structure. 3258 * which implement the MSI or MSI-X Capability Structure.
3462 */ 3259 */
3463static struct irq_chip msi_chip = { 3260static struct irq_chip msi_chip = {
3464 .name = "PCI-MSI", 3261 .name = "PCI-MSI",
3465 .unmask = unmask_msi_irq, 3262 .irq_unmask = unmask_msi_irq,
3466 .mask = mask_msi_irq, 3263 .irq_mask = mask_msi_irq,
3467 .ack = ack_apic_edge, 3264 .irq_ack = ack_apic_edge,
3468#ifdef CONFIG_SMP 3265#ifdef CONFIG_SMP
3469 .set_affinity = set_msi_irq_affinity, 3266 .irq_set_affinity = msi_set_affinity,
3470#endif 3267#endif
3471 .retrigger = ioapic_retrigger_irq, 3268 .irq_retrigger = ioapic_retrigger_irq,
3472}; 3269};
3473 3270
3474static struct irq_chip msi_ir_chip = { 3271static struct irq_chip msi_ir_chip = {
3475 .name = "IR-PCI-MSI", 3272 .name = "IR-PCI-MSI",
3476 .unmask = unmask_msi_irq, 3273 .irq_unmask = unmask_msi_irq,
3477 .mask = mask_msi_irq, 3274 .irq_mask = mask_msi_irq,
3478#ifdef CONFIG_INTR_REMAP 3275#ifdef CONFIG_INTR_REMAP
3479 .ack = ir_ack_apic_edge, 3276 .irq_ack = ir_ack_apic_edge,
3480#ifdef CONFIG_SMP 3277#ifdef CONFIG_SMP
3481 .set_affinity = ir_set_msi_irq_affinity, 3278 .irq_set_affinity = ir_msi_set_affinity,
3482#endif 3279#endif
3483#endif 3280#endif
3484 .retrigger = ioapic_retrigger_irq, 3281 .irq_retrigger = ioapic_retrigger_irq,
3485}; 3282};
3486 3283
3487/* 3284/*
@@ -3513,8 +3310,8 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
3513 3310
3514static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) 3311static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3515{ 3312{
3516 int ret;
3517 struct msi_msg msg; 3313 struct msi_msg msg;
3314 int ret;
3518 3315
3519 ret = msi_compose_msg(dev, irq, &msg, -1); 3316 ret = msi_compose_msg(dev, irq, &msg, -1);
3520 if (ret < 0) 3317 if (ret < 0)
@@ -3523,12 +3320,8 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3523 set_irq_msi(irq, msidesc); 3320 set_irq_msi(irq, msidesc);
3524 write_msi_msg(irq, &msg); 3321 write_msi_msg(irq, &msg);
3525 3322
3526 if (irq_remapped(irq)) { 3323 if (irq_remapped(get_irq_chip_data(irq))) {
3527 struct irq_desc *desc = irq_to_desc(irq); 3324 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
3528 /*
3529 * irq migration in process context
3530 */
3531 desc->status |= IRQ_MOVE_PCNTXT;
3532 set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); 3325 set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
3533 } else 3326 } else
3534 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); 3327 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
@@ -3540,13 +3333,10 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3540 3333
3541int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) 3334int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3542{ 3335{
3543 unsigned int irq; 3336 int node, ret, sub_handle, index = 0;
3544 int ret, sub_handle; 3337 unsigned int irq, irq_want;
3545 struct msi_desc *msidesc; 3338 struct msi_desc *msidesc;
3546 unsigned int irq_want;
3547 struct intel_iommu *iommu = NULL; 3339 struct intel_iommu *iommu = NULL;
3548 int index = 0;
3549 int node;
3550 3340
3551 /* x86 doesn't support multiple MSI yet */ 3341 /* x86 doesn't support multiple MSI yet */
3552 if (type == PCI_CAP_ID_MSI && nvec > 1) 3342 if (type == PCI_CAP_ID_MSI && nvec > 1)
@@ -3606,18 +3396,17 @@ void arch_teardown_msi_irq(unsigned int irq)
3606 3396
3607#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) 3397#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
3608#ifdef CONFIG_SMP 3398#ifdef CONFIG_SMP
3609static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) 3399static int
3400dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
3401 bool force)
3610{ 3402{
3611 struct irq_desc *desc = irq_to_desc(irq); 3403 struct irq_cfg *cfg = data->chip_data;
3612 struct irq_cfg *cfg; 3404 unsigned int dest, irq = data->irq;
3613 struct msi_msg msg; 3405 struct msi_msg msg;
3614 unsigned int dest;
3615 3406
3616 if (set_desc_affinity(desc, mask, &dest)) 3407 if (__ioapic_set_affinity(data, mask, &dest))
3617 return -1; 3408 return -1;
3618 3409
3619 cfg = desc->chip_data;
3620
3621 dmar_msi_read(irq, &msg); 3410 dmar_msi_read(irq, &msg);
3622 3411
3623 msg.data &= ~MSI_DATA_VECTOR_MASK; 3412 msg.data &= ~MSI_DATA_VECTOR_MASK;
@@ -3633,14 +3422,14 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3633#endif /* CONFIG_SMP */ 3422#endif /* CONFIG_SMP */
3634 3423
3635static struct irq_chip dmar_msi_type = { 3424static struct irq_chip dmar_msi_type = {
3636 .name = "DMAR_MSI", 3425 .name = "DMAR_MSI",
3637 .unmask = dmar_msi_unmask, 3426 .irq_unmask = dmar_msi_unmask,
3638 .mask = dmar_msi_mask, 3427 .irq_mask = dmar_msi_mask,
3639 .ack = ack_apic_edge, 3428 .irq_ack = ack_apic_edge,
3640#ifdef CONFIG_SMP 3429#ifdef CONFIG_SMP
3641 .set_affinity = dmar_msi_set_affinity, 3430 .irq_set_affinity = dmar_msi_set_affinity,
3642#endif 3431#endif
3643 .retrigger = ioapic_retrigger_irq, 3432 .irq_retrigger = ioapic_retrigger_irq,
3644}; 3433};
3645 3434
3646int arch_setup_dmar_msi(unsigned int irq) 3435int arch_setup_dmar_msi(unsigned int irq)
@@ -3661,26 +3450,24 @@ int arch_setup_dmar_msi(unsigned int irq)
3661#ifdef CONFIG_HPET_TIMER 3450#ifdef CONFIG_HPET_TIMER
3662 3451
3663#ifdef CONFIG_SMP 3452#ifdef CONFIG_SMP
3664static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) 3453static int hpet_msi_set_affinity(struct irq_data *data,
3454 const struct cpumask *mask, bool force)
3665{ 3455{
3666 struct irq_desc *desc = irq_to_desc(irq); 3456 struct irq_cfg *cfg = data->chip_data;
3667 struct irq_cfg *cfg;
3668 struct msi_msg msg; 3457 struct msi_msg msg;
3669 unsigned int dest; 3458 unsigned int dest;
3670 3459
3671 if (set_desc_affinity(desc, mask, &dest)) 3460 if (__ioapic_set_affinity(data, mask, &dest))
3672 return -1; 3461 return -1;
3673 3462
3674 cfg = desc->chip_data; 3463 hpet_msi_read(data->handler_data, &msg);
3675
3676 hpet_msi_read(irq, &msg);
3677 3464
3678 msg.data &= ~MSI_DATA_VECTOR_MASK; 3465 msg.data &= ~MSI_DATA_VECTOR_MASK;
3679 msg.data |= MSI_DATA_VECTOR(cfg->vector); 3466 msg.data |= MSI_DATA_VECTOR(cfg->vector);
3680 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; 3467 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
3681 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3468 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3682 3469
3683 hpet_msi_write(irq, &msg); 3470 hpet_msi_write(data->handler_data, &msg);
3684 3471
3685 return 0; 3472 return 0;
3686} 3473}
@@ -3688,34 +3475,33 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3688#endif /* CONFIG_SMP */ 3475#endif /* CONFIG_SMP */
3689 3476
3690static struct irq_chip ir_hpet_msi_type = { 3477static struct irq_chip ir_hpet_msi_type = {
3691 .name = "IR-HPET_MSI", 3478 .name = "IR-HPET_MSI",
3692 .unmask = hpet_msi_unmask, 3479 .irq_unmask = hpet_msi_unmask,
3693 .mask = hpet_msi_mask, 3480 .irq_mask = hpet_msi_mask,
3694#ifdef CONFIG_INTR_REMAP 3481#ifdef CONFIG_INTR_REMAP
3695 .ack = ir_ack_apic_edge, 3482 .irq_ack = ir_ack_apic_edge,
3696#ifdef CONFIG_SMP 3483#ifdef CONFIG_SMP
3697 .set_affinity = ir_set_msi_irq_affinity, 3484 .irq_set_affinity = ir_msi_set_affinity,
3698#endif 3485#endif
3699#endif 3486#endif
3700 .retrigger = ioapic_retrigger_irq, 3487 .irq_retrigger = ioapic_retrigger_irq,
3701}; 3488};
3702 3489
3703static struct irq_chip hpet_msi_type = { 3490static struct irq_chip hpet_msi_type = {
3704 .name = "HPET_MSI", 3491 .name = "HPET_MSI",
3705 .unmask = hpet_msi_unmask, 3492 .irq_unmask = hpet_msi_unmask,
3706 .mask = hpet_msi_mask, 3493 .irq_mask = hpet_msi_mask,
3707 .ack = ack_apic_edge, 3494 .irq_ack = ack_apic_edge,
3708#ifdef CONFIG_SMP 3495#ifdef CONFIG_SMP
3709 .set_affinity = hpet_msi_set_affinity, 3496 .irq_set_affinity = hpet_msi_set_affinity,
3710#endif 3497#endif
3711 .retrigger = ioapic_retrigger_irq, 3498 .irq_retrigger = ioapic_retrigger_irq,
3712}; 3499};
3713 3500
3714int arch_setup_hpet_msi(unsigned int irq, unsigned int id) 3501int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3715{ 3502{
3716 int ret;
3717 struct msi_msg msg; 3503 struct msi_msg msg;
3718 struct irq_desc *desc = irq_to_desc(irq); 3504 int ret;
3719 3505
3720 if (intr_remapping_enabled) { 3506 if (intr_remapping_enabled) {
3721 struct intel_iommu *iommu = map_hpet_to_ir(id); 3507 struct intel_iommu *iommu = map_hpet_to_ir(id);
@@ -3733,9 +3519,9 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3733 if (ret < 0) 3519 if (ret < 0)
3734 return ret; 3520 return ret;
3735 3521
3736 hpet_msi_write(irq, &msg); 3522 hpet_msi_write(get_irq_data(irq), &msg);
3737 desc->status |= IRQ_MOVE_PCNTXT; 3523 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
3738 if (irq_remapped(irq)) 3524 if (irq_remapped(get_irq_chip_data(irq)))
3739 set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type, 3525 set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type,
3740 handle_edge_irq, "edge"); 3526 handle_edge_irq, "edge");
3741 else 3527 else
@@ -3768,33 +3554,30 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
3768 write_ht_irq_msg(irq, &msg); 3554 write_ht_irq_msg(irq, &msg);
3769} 3555}
3770 3556
3771static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) 3557static int
3558ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
3772{ 3559{
3773 struct irq_desc *desc = irq_to_desc(irq); 3560 struct irq_cfg *cfg = data->chip_data;
3774 struct irq_cfg *cfg;
3775 unsigned int dest; 3561 unsigned int dest;
3776 3562
3777 if (set_desc_affinity(desc, mask, &dest)) 3563 if (__ioapic_set_affinity(data, mask, &dest))
3778 return -1; 3564 return -1;
3779 3565
3780 cfg = desc->chip_data; 3566 target_ht_irq(data->irq, dest, cfg->vector);
3781
3782 target_ht_irq(irq, dest, cfg->vector);
3783
3784 return 0; 3567 return 0;
3785} 3568}
3786 3569
3787#endif 3570#endif
3788 3571
3789static struct irq_chip ht_irq_chip = { 3572static struct irq_chip ht_irq_chip = {
3790 .name = "PCI-HT", 3573 .name = "PCI-HT",
3791 .mask = mask_ht_irq, 3574 .irq_mask = mask_ht_irq,
3792 .unmask = unmask_ht_irq, 3575 .irq_unmask = unmask_ht_irq,
3793 .ack = ack_apic_edge, 3576 .irq_ack = ack_apic_edge,
3794#ifdef CONFIG_SMP 3577#ifdef CONFIG_SMP
3795 .set_affinity = set_ht_irq_affinity, 3578 .irq_set_affinity = ht_set_affinity,
3796#endif 3579#endif
3797 .retrigger = ioapic_retrigger_irq, 3580 .irq_retrigger = ioapic_retrigger_irq,
3798}; 3581};
3799 3582
3800int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) 3583int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
@@ -3885,14 +3668,13 @@ int __init arch_probe_nr_irqs(void)
3885 if (nr < nr_irqs) 3668 if (nr < nr_irqs)
3886 nr_irqs = nr; 3669 nr_irqs = nr;
3887 3670
3888 return 0; 3671 return NR_IRQS_LEGACY;
3889} 3672}
3890#endif 3673#endif
3891 3674
3892static int __io_apic_set_pci_routing(struct device *dev, int irq, 3675static int __io_apic_set_pci_routing(struct device *dev, int irq,
3893 struct io_apic_irq_attr *irq_attr) 3676 struct io_apic_irq_attr *irq_attr)
3894{ 3677{
3895 struct irq_desc *desc;
3896 struct irq_cfg *cfg; 3678 struct irq_cfg *cfg;
3897 int node; 3679 int node;
3898 int ioapic, pin; 3680 int ioapic, pin;
@@ -3908,13 +3690,11 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq,
3908 if (dev) 3690 if (dev)
3909 node = dev_to_node(dev); 3691 node = dev_to_node(dev);
3910 else 3692 else
3911 node = cpu_to_node(boot_cpu_id); 3693 node = cpu_to_node(0);
3912 3694
3913 desc = irq_to_desc_alloc_node(irq, node); 3695 cfg = alloc_irq_and_cfg_at(irq, node);
3914 if (!desc) { 3696 if (!cfg)
3915 printk(KERN_INFO "can not get irq_desc %d\n", irq);
3916 return 0; 3697 return 0;
3917 }
3918 3698
3919 pin = irq_attr->ioapic_pin; 3699 pin = irq_attr->ioapic_pin;
3920 trigger = irq_attr->trigger; 3700 trigger = irq_attr->trigger;
@@ -3924,15 +3704,14 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq,
3924 * IRQs < 16 are already in the irq_2_pin[] map 3704 * IRQs < 16 are already in the irq_2_pin[] map
3925 */ 3705 */
3926 if (irq >= legacy_pic->nr_legacy_irqs) { 3706 if (irq >= legacy_pic->nr_legacy_irqs) {
3927 cfg = desc->chip_data; 3707 if (__add_pin_to_irq_node(cfg, node, ioapic, pin)) {
3928 if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) {
3929 printk(KERN_INFO "can not add pin %d for irq %d\n", 3708 printk(KERN_INFO "can not add pin %d for irq %d\n",
3930 pin, irq); 3709 pin, irq);
3931 return 0; 3710 return 0;
3932 } 3711 }
3933 } 3712 }
3934 3713
3935 setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity); 3714 setup_ioapic_irq(ioapic, pin, irq, cfg, trigger, polarity);
3936 3715
3937 return 0; 3716 return 0;
3938} 3717}
@@ -4125,14 +3904,14 @@ void __init setup_ioapic_dest(void)
4125 */ 3904 */
4126 if (desc->status & 3905 if (desc->status &
4127 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) 3906 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
4128 mask = desc->affinity; 3907 mask = desc->irq_data.affinity;
4129 else 3908 else
4130 mask = apic->target_cpus(); 3909 mask = apic->target_cpus();
4131 3910
4132 if (intr_remapping_enabled) 3911 if (intr_remapping_enabled)
4133 set_ir_ioapic_affinity_irq_desc(desc, mask); 3912 ir_ioapic_set_affinity(&desc->irq_data, mask, false);
4134 else 3913 else
4135 set_ioapic_affinity_irq_desc(desc, mask); 3914 ioapic_set_affinity(&desc->irq_data, mask, false);
4136 } 3915 }
4137 3916
4138} 3917}
@@ -4316,19 +4095,18 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
4316void __init pre_init_apic_IRQ0(void) 4095void __init pre_init_apic_IRQ0(void)
4317{ 4096{
4318 struct irq_cfg *cfg; 4097 struct irq_cfg *cfg;
4319 struct irq_desc *desc;
4320 4098
4321 printk(KERN_INFO "Early APIC setup for system timer0\n"); 4099 printk(KERN_INFO "Early APIC setup for system timer0\n");
4322#ifndef CONFIG_SMP 4100#ifndef CONFIG_SMP
4323 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); 4101 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
4324#endif 4102#endif
4325 desc = irq_to_desc_alloc_node(0, 0); 4103 /* Make sure the irq descriptor is set up */
4104 cfg = alloc_irq_and_cfg_at(0, 0);
4326 4105
4327 setup_local_APIC(); 4106 setup_local_APIC();
4328 4107
4329 cfg = irq_cfg(0);
4330 add_pin_to_irq_node(cfg, 0, 0, 0); 4108 add_pin_to_irq_node(cfg, 0, 0, 0);
4331 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); 4109 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
4332 4110
4333 setup_IO_APIC_irq(0, 0, 0, desc, 0, 0); 4111 setup_ioapic_irq(0, 0, 0, cfg, 0, 0);
4334} 4112}
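
The io_apic.c changes above are part of the genirq conversion in this series: irq_chip callbacks move to their irq_-prefixed forms (.irq_mask, .irq_unmask, .irq_ack, .irq_set_affinity, .irq_retrigger) and take a struct irq_data instead of a bare IRQ number, with the per-IRQ struct irq_cfg reached through data->chip_data rather than through irq_to_desc(). Below is a minimal sketch of a chip written against that interface; the my_* names, register offsets and the BIT() usage are hypothetical stand-ins, and only the callback signatures and the chip_data access pattern mirror the patch.

/*
 * Minimal sketch of an irq_data-based irq_chip (the style converted to above).
 * All my_* names and register offsets are hypothetical; only the signatures
 * and the data->chip_data access follow the patch.
 */
#include <linux/irq.h>
#include <linux/io.h>
#include <linux/bitops.h>

struct my_cfg {
	void __iomem	*regs;		/* stashed earlier via set_irq_chip_data() */
	u8		vector;
};

static void my_irq_mask(struct irq_data *data)
{
	struct my_cfg *cfg = data->chip_data;	/* no irq_to_desc() round trip */

	writel(BIT(data->irq & 31), cfg->regs + 0x10);	/* hypothetical MASK reg */
}

static void my_irq_unmask(struct irq_data *data)
{
	struct my_cfg *cfg = data->chip_data;

	writel(BIT(data->irq & 31), cfg->regs + 0x14);	/* hypothetical UNMASK reg */
}

static struct irq_chip my_chip = {
	.name		= "MY-CHIP",
	.irq_mask	= my_irq_mask,		/* was .mask(unsigned int irq)   */
	.irq_unmask	= my_irq_unmask,	/* was .unmask(unsigned int irq) */
};
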
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index a43f71cb30f8..c90041ccb742 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -178,7 +178,7 @@ int __init check_nmi_watchdog(void)
178error: 178error:
179 if (nmi_watchdog == NMI_IO_APIC) { 179 if (nmi_watchdog == NMI_IO_APIC) {
180 if (!timer_through_8259) 180 if (!timer_through_8259)
181 legacy_pic->chip->mask(0); 181 legacy_pic->mask(0);
182 on_each_cpu(__acpi_nmi_disable, NULL, 1); 182 on_each_cpu(__acpi_nmi_disable, NULL, 1);
183 } 183 }
184 184
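
The one-line change in nmi.c matches the check_timer() changes earlier in the series: callers stop going through legacy_pic->chip->mask()/unmask() and instead call mask()/unmask() hooks on the legacy_pic itself. A rough sketch of the resulting structure follows; it is inferred from the call sites only, and fields other than the mask/unmask pair may not match arch/x86/include/asm/i8259.h exactly.

/*
 * Sketch inferred from the call sites above, not copied from i8259.h;
 * only the added mask/unmask hooks are the point here.
 */
struct legacy_pic {
	int nr_legacy_irqs;
	struct irq_chip *chip;			/* still provides the flow callbacks      */
	void (*mask)(unsigned int irq);		/* new: mask one legacy pin directly      */
	void (*unmask)(unsigned int irq);	/* new: unmask one legacy pin directly    */
	/* ... remaining ops (init, mask_all, restore_mask, ...) elided ... */
};
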
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 83e9be4778e2..f9e4e6a54073 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -54,6 +54,9 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
54 */ 54 */
55void __init default_setup_apic_routing(void) 55void __init default_setup_apic_routing(void)
56{ 56{
57
58 enable_IR_x2apic();
59
57#ifdef CONFIG_X86_X2APIC 60#ifdef CONFIG_X86_X2APIC
58 if (x2apic_mode 61 if (x2apic_mode
59#ifdef CONFIG_X86_UV 62#ifdef CONFIG_X86_UV
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index ba5f62f45f01..9e093f8fe78c 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -148,7 +148,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
148{ 148{
149#ifdef CONFIG_SMP 149#ifdef CONFIG_SMP
150 /* calling is from identify_secondary_cpu() ? */ 150 /* calling is from identify_secondary_cpu() ? */
151 if (c->cpu_index == boot_cpu_id) 151 if (!c->cpu_index)
152 return; 152 return;
153 153
154 /* 154 /*
@@ -253,37 +253,51 @@ static int __cpuinit nearby_node(int apicid)
253#endif 253#endif
254 254
255/* 255/*
256 * Fixup core topology information for AMD multi-node processors. 256 * Fixup core topology information for
257 * Assumption: Number of cores in each internal node is the same. 257 * (1) AMD multi-node processors
258 * Assumption: Number of cores in each internal node is the same.
259 * (2) AMD processors supporting compute units
258 */ 260 */
259#ifdef CONFIG_X86_HT 261#ifdef CONFIG_X86_HT
260static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c) 262static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c)
261{ 263{
262 unsigned long long value; 264 u32 nodes;
263 u32 nodes, cores_per_node; 265 u8 node_id;
264 int cpu = smp_processor_id(); 266 int cpu = smp_processor_id();
265 267
266 if (!cpu_has(c, X86_FEATURE_NODEID_MSR)) 268 /* get information required for multi-node processors */
267 return; 269 if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
270 u32 eax, ebx, ecx, edx;
268 271
269 /* fixup topology information only once for a core */ 272 cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
270 if (cpu_has(c, X86_FEATURE_AMD_DCM)) 273 nodes = ((ecx >> 8) & 7) + 1;
271 return; 274 node_id = ecx & 7;
272 275
273 rdmsrl(MSR_FAM10H_NODE_ID, value); 276 /* get compute unit information */
277 smp_num_siblings = ((ebx >> 8) & 3) + 1;
278 c->compute_unit_id = ebx & 0xff;
279 } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
280 u64 value;
274 281
275 nodes = ((value >> 3) & 7) + 1; 282 rdmsrl(MSR_FAM10H_NODE_ID, value);
276 if (nodes == 1) 283 nodes = ((value >> 3) & 7) + 1;
284 node_id = value & 7;
285 } else
277 return; 286 return;
278 287
279 set_cpu_cap(c, X86_FEATURE_AMD_DCM); 288 /* fixup multi-node processor information */
280 cores_per_node = c->x86_max_cores / nodes; 289 if (nodes > 1) {
290 u32 cores_per_node;
291
292 set_cpu_cap(c, X86_FEATURE_AMD_DCM);
293 cores_per_node = c->x86_max_cores / nodes;
281 294
282 /* store NodeID, use llc_shared_map to store sibling info */ 295 /* store NodeID, use llc_shared_map to store sibling info */
283 per_cpu(cpu_llc_id, cpu) = value & 7; 296 per_cpu(cpu_llc_id, cpu) = node_id;
284 297
285 /* fixup core id to be in range from 0 to (cores_per_node - 1) */ 298 /* core id to be in range from 0 to (cores_per_node - 1) */
286 c->cpu_core_id = c->cpu_core_id % cores_per_node; 299 c->cpu_core_id = c->cpu_core_id % cores_per_node;
300 }
287} 301}
288#endif 302#endif
289 303
@@ -304,9 +318,7 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
304 c->phys_proc_id = c->initial_apicid >> bits; 318 c->phys_proc_id = c->initial_apicid >> bits;
305 /* use socket ID also for last level cache */ 319 /* use socket ID also for last level cache */
306 per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; 320 per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
307 /* fixup topology information on multi-node processors */ 321 amd_get_topology(c);
308 if ((c->x86 == 0x10) && (c->x86_model == 9))
309 amd_fixup_dcm(c);
310#endif 322#endif
311} 323}
312 324
@@ -412,6 +424,23 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
412 set_cpu_cap(c, X86_FEATURE_EXTD_APICID); 424 set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
413 } 425 }
414#endif 426#endif
427
428 /* We need to do the following only once */
429 if (c != &boot_cpu_data)
430 return;
431
432 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
433
434 if (c->x86 > 0x10 ||
435 (c->x86 == 0x10 && c->x86_model >= 0x2)) {
436 u64 val;
437
438 rdmsrl(MSR_K7_HWCR, val);
439 if (!(val & BIT(24)))
440 printk(KERN_WARNING FW_BUG "TSC doesn't count "
441 "with P0 frequency!\n");
442 }
443 }
415} 444}
416 445
417static void __cpuinit init_amd(struct cpuinfo_x86 *c) 446static void __cpuinit init_amd(struct cpuinfo_x86 *c)
@@ -523,7 +552,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
523#endif 552#endif
524 553
525 if (c->extended_cpuid_level >= 0x80000006) { 554 if (c->extended_cpuid_level >= 0x80000006) {
526 if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000)) 555 if (cpuid_edx(0x80000006) & 0xf000)
527 num_cache_leaves = 4; 556 num_cache_leaves = 4;
528 else 557 else
529 num_cache_leaves = 3; 558 num_cache_leaves = 3;
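
The amd_get_topology() change above derives node and compute-unit information either from CPUID leaf 0x8000001e (when X86_FEATURE_TOPOEXT is set) or from the NODEID MSR. The bit layout used there can be checked from userspace; the sketch below decodes the same fields the patch reads (nodes per processor and node id from ECX, compute unit id and cores per compute unit from EBX) on a machine that reports the topology-extensions leaf. It illustrates the decoding only, not the kernel side.

/* Decode the same CPUID 0x8000001e fields amd_get_topology() uses.
 * Build: gcc -O2 topoext.c -o topoext ; only meaningful on AMD CPUs
 * that implement the topology-extensions leaf.
 */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0x8000001e, &eax, &ebx, &ecx, &edx)) {
		fprintf(stderr, "CPUID leaf 0x8000001e not available\n");
		return 1;
	}

	/* Same shifts and masks as the patch above */
	unsigned int nodes           = ((ecx >> 8) & 7) + 1;	/* nodes per processor     */
	unsigned int node_id         = ecx & 7;
	unsigned int cores_per_cu    = ((ebx >> 8) & 3) + 1;	/* -> smp_num_siblings     */
	unsigned int compute_unit_id = ebx & 0xff;

	printf("nodes=%u node_id=%u cores/CU=%u CU id=%u\n",
	       nodes, node_id, cores_per_cu, compute_unit_id);
	return 0;
}
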
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f2f9ac7da25c..4b68bda30938 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -665,7 +665,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
665 this_cpu->c_early_init(c); 665 this_cpu->c_early_init(c);
666 666
667#ifdef CONFIG_SMP 667#ifdef CONFIG_SMP
668 c->cpu_index = boot_cpu_id; 668 c->cpu_index = 0;
669#endif 669#endif
670 filter_cpuid_features(c, false); 670 filter_cpuid_features(c, false);
671} 671}
@@ -704,16 +704,21 @@ void __init early_cpu_init(void)
704} 704}
705 705
706/* 706/*
707 * The NOPL instruction is supposed to exist on all CPUs with 707 * The NOPL instruction is supposed to exist on all CPUs of family >= 6;
708 * family >= 6; unfortunately, that's not true in practice because 708 * unfortunately, that's not true in practice because of early VIA
709 * of early VIA chips and (more importantly) broken virtualizers that 709 * chips and (more importantly) broken virtualizers that are not easy
710 * are not easy to detect. In the latter case it doesn't even *fail* 710 * to detect. In the latter case it doesn't even *fail* reliably, so
711 * reliably, so probing for it doesn't even work. Disable it completely 711 * probing for it doesn't even work. Disable it completely on 32-bit
712 * unless we can find a reliable way to detect all the broken cases. 712 * unless we can find a reliable way to detect all the broken cases.
713 * Enable it explicitly on 64-bit for non-constant inputs of cpu_has().
713 */ 714 */
714static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) 715static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
715{ 716{
717#ifdef CONFIG_X86_32
716 clear_cpu_cap(c, X86_FEATURE_NOPL); 718 clear_cpu_cap(c, X86_FEATURE_NOPL);
719#else
720 set_cpu_cap(c, X86_FEATURE_NOPL);
721#endif
717} 722}
718 723
719static void __cpuinit generic_identify(struct cpuinfo_x86 *c) 724static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
@@ -1264,13 +1269,6 @@ void __cpuinit cpu_init(void)
1264 clear_all_debug_regs(); 1269 clear_all_debug_regs();
1265 dbg_restore_debug_regs(); 1270 dbg_restore_debug_regs();
1266 1271
1267 /*
1268 * Force FPU initialization:
1269 */
1270 current_thread_info()->status = 0;
1271 clear_used_math();
1272 mxcsr_feature_mask_init();
1273
1274 fpu_init(); 1272 fpu_init();
1275 xsave_init(); 1273 xsave_init();
1276} 1274}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index f668bb1f7d43..e765633f210e 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -32,6 +32,7 @@ struct cpu_dev {
32extern const struct cpu_dev *const __x86_cpu_dev_start[], 32extern const struct cpu_dev *const __x86_cpu_dev_start[],
33 *const __x86_cpu_dev_end[]; 33 *const __x86_cpu_dev_end[];
34 34
35extern void get_cpu_cap(struct cpuinfo_x86 *c);
35extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); 36extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
36extern void get_cpu_cap(struct cpuinfo_x86 *c); 37extern void get_cpu_cap(struct cpuinfo_x86 *c);
37 38
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index b4389441efbb..695f17731e23 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -170,7 +170,7 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
170{ 170{
171#ifdef CONFIG_SMP 171#ifdef CONFIG_SMP
172 /* calling is from identify_secondary_cpu() ? */ 172 /* calling is from identify_secondary_cpu() ? */
173 if (c->cpu_index == boot_cpu_id) 173 if (!c->cpu_index)
174 return; 174 return;
175 175
176 /* 176 /*
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 898c2f4eab88..12cd823c8d03 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -17,7 +17,7 @@
17 17
18#include <asm/processor.h> 18#include <asm/processor.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <asm/k8.h> 20#include <asm/amd_nb.h>
21#include <asm/smp.h> 21#include <asm/smp.h>
22 22
23#define LVL_1_INST 1 23#define LVL_1_INST 1
@@ -306,7 +306,7 @@ struct _cache_attr {
306 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); 306 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
307}; 307};
308 308
309#ifdef CONFIG_CPU_SUP_AMD 309#ifdef CONFIG_AMD_NB
310 310
311/* 311/*
312 * L3 cache descriptors 312 * L3 cache descriptors
@@ -369,7 +369,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
369 return; 369 return;
370 370
371 /* not in virtualized environments */ 371 /* not in virtualized environments */
372 if (num_k8_northbridges == 0) 372 if (k8_northbridges.num == 0)
373 return; 373 return;
374 374
375 /* 375 /*
@@ -377,7 +377,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
377 * never freed but this is done only on shutdown so it doesn't matter. 377 * never freed but this is done only on shutdown so it doesn't matter.
378 */ 378 */
379 if (!l3_caches) { 379 if (!l3_caches) {
380 int size = num_k8_northbridges * sizeof(struct amd_l3_cache *); 380 int size = k8_northbridges.num * sizeof(struct amd_l3_cache *);
381 381
382 l3_caches = kzalloc(size, GFP_ATOMIC); 382 l3_caches = kzalloc(size, GFP_ATOMIC);
383 if (!l3_caches) 383 if (!l3_caches)
@@ -556,12 +556,12 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
556static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, 556static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
557 show_cache_disable_1, store_cache_disable_1); 557 show_cache_disable_1, store_cache_disable_1);
558 558
559#else /* CONFIG_CPU_SUP_AMD */ 559#else /* CONFIG_AMD_NB */
560static void __cpuinit 560static void __cpuinit
561amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index) 561amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
562{ 562{
563}; 563};
564#endif /* CONFIG_CPU_SUP_AMD */ 564#endif /* CONFIG_AMD_NB */
565 565
566static int 566static int
567__cpuinit cpuid4_cache_lookup_regs(int index, 567__cpuinit cpuid4_cache_lookup_regs(int index,
@@ -1000,7 +1000,7 @@ static struct attribute *default_attrs[] = {
1000 1000
1001static struct attribute *default_l3_attrs[] = { 1001static struct attribute *default_l3_attrs[] = {
1002 DEFAULT_SYSFS_CACHE_ATTRS, 1002 DEFAULT_SYSFS_CACHE_ATTRS,
1003#ifdef CONFIG_CPU_SUP_AMD 1003#ifdef CONFIG_AMD_NB
1004 &cache_disable_0.attr, 1004 &cache_disable_0.attr,
1005 &cache_disable_1.attr, 1005 &cache_disable_1.attr,
1006#endif 1006#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 39aaee5c1ab2..80c482382d5c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -131,7 +131,8 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
131 u32 low = 0, high = 0, address = 0; 131 u32 low = 0, high = 0, address = 0;
132 unsigned int bank, block; 132 unsigned int bank, block;
133 struct thresh_restart tr; 133 struct thresh_restart tr;
134 u8 lvt_off; 134 int lvt_off = -1;
135 u8 offset;
135 136
136 for (bank = 0; bank < NR_BANKS; ++bank) { 137 for (bank = 0; bank < NR_BANKS; ++bank) {
137 for (block = 0; block < NR_BLOCKS; ++block) { 138 for (block = 0; block < NR_BLOCKS; ++block) {
@@ -162,8 +163,28 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
162 if (shared_bank[bank] && c->cpu_core_id) 163 if (shared_bank[bank] && c->cpu_core_id)
163 break; 164 break;
164#endif 165#endif
165 lvt_off = setup_APIC_eilvt_mce(THRESHOLD_APIC_VECTOR, 166 offset = (high & MASK_LVTOFF_HI) >> 20;
166 APIC_EILVT_MSG_FIX, 0); 167 if (lvt_off < 0) {
168 if (setup_APIC_eilvt(offset,
169 THRESHOLD_APIC_VECTOR,
170 APIC_EILVT_MSG_FIX, 0)) {
171 pr_err(FW_BUG "cpu %d, failed to "
172 "setup threshold interrupt "
173 "for bank %d, block %d "
174 "(MSR%08X=0x%x%08x)",
175 smp_processor_id(), bank, block,
176 address, high, low);
177 continue;
178 }
179 lvt_off = offset;
180 } else if (lvt_off != offset) {
181 pr_err(FW_BUG "cpu %d, invalid threshold "
 182 "interrupt offset %d for bank %d, "
183 "block %d (MSR%08X=0x%x%08x)",
184 smp_processor_id(), lvt_off, bank,
185 block, address, high, low);
186 continue;
187 }
167 188
168 high &= ~MASK_LVTOFF_HI; 189 high &= ~MASK_LVTOFF_HI;
169 high |= lvt_off << 20; 190 high |= lvt_off << 20;
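
Instead of hard-coding the threshold LVT offset, the hunk above reads it from bits 23:20 of each block's MISC MSR high word ((high & MASK_LVTOFF_HI) >> 20), programs the extended LVT entry once via setup_APIC_eilvt(), and then requires every further bank/block to report the same offset, complaining with FW_BUG otherwise. The standalone snippet below walks through just that extraction and consistency rule; the sample MSR values and the 0x00F00000 mask are illustrative assumptions consistent with the >> 20 shift used above.

/* Illustration of the LVT-offset consistency check added above.
 * MASK_LVTOFF_HI is assumed to be 0x00F00000 (bits 23:20 of the MISC MSR
 * high word), consistent with the ">> 20" in the patch.
 */
#include <stdio.h>

#define MASK_LVTOFF_HI 0x00F00000u

int main(void)
{
	unsigned int highs[] = { 0x00100000, 0x00100000, 0x00200000 }; /* sample MSR high words */
	int lvt_off = -1;
	unsigned int i;

	for (i = 0; i < sizeof(highs) / sizeof(highs[0]); i++) {
		unsigned int offset = (highs[i] & MASK_LVTOFF_HI) >> 20;

		if (lvt_off < 0)
			lvt_off = offset;	/* first block: program the EILVT entry once */
		else if ((unsigned int)lvt_off != offset)
			printf("block %u: inconsistent LVT offset %u (expected %d)\n",
			       i, offset, lvt_off);
	}
	return 0;
}
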
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 169d8804a9f8..4b683267eca5 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -350,7 +350,7 @@ static void intel_thermal_interrupt(void)
350 350
351static void unexpected_thermal_interrupt(void) 351static void unexpected_thermal_interrupt(void)
352{ 352{
353 printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", 353 printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n",
354 smp_processor_id()); 354 smp_processor_id());
355 add_taint(TAINT_MACHINE_CHECK); 355 add_taint(TAINT_MACHINE_CHECK);
356} 356}
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index c5f59d071425..ac140c7be396 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -827,7 +827,7 @@ int __init amd_special_default_mtrr(void)
827 827
828 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) 828 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
829 return 0; 829 return 0;
830 if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) 830 if (boot_cpu_data.x86 < 0xf)
831 return 0; 831 return 0;
832 /* In case some hypervisor doesn't pass SYSCFG through: */ 832 /* In case some hypervisor doesn't pass SYSCFG through: */
833 if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) 833 if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 7d28d7d03885..9f27228ceffd 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -64,18 +64,59 @@ static inline void k8_check_syscfg_dram_mod_en(void)
64 } 64 }
65} 65}
66 66
67/* Get the size of contiguous MTRR range */
68static u64 get_mtrr_size(u64 mask)
69{
70 u64 size;
71
72 mask >>= PAGE_SHIFT;
73 mask |= size_or_mask;
74 size = -mask;
75 size <<= PAGE_SHIFT;
76 return size;
77}
78
67/* 79/*
68 * Returns the effective MTRR type for the region 80 * Check and return the effective type for MTRR-MTRR type overlap.
69 * Error returns: 81 * Returns 1 if the effective type is UNCACHEABLE, else returns 0
70 * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR
71 * - 0xFF - when MTRR is not enabled
72 */ 82 */
73u8 mtrr_type_lookup(u64 start, u64 end) 83static int check_type_overlap(u8 *prev, u8 *curr)
84{
85 if (*prev == MTRR_TYPE_UNCACHABLE || *curr == MTRR_TYPE_UNCACHABLE) {
86 *prev = MTRR_TYPE_UNCACHABLE;
87 *curr = MTRR_TYPE_UNCACHABLE;
88 return 1;
89 }
90
91 if ((*prev == MTRR_TYPE_WRBACK && *curr == MTRR_TYPE_WRTHROUGH) ||
92 (*prev == MTRR_TYPE_WRTHROUGH && *curr == MTRR_TYPE_WRBACK)) {
93 *prev = MTRR_TYPE_WRTHROUGH;
94 *curr = MTRR_TYPE_WRTHROUGH;
95 }
96
97 if (*prev != *curr) {
98 *prev = MTRR_TYPE_UNCACHABLE;
99 *curr = MTRR_TYPE_UNCACHABLE;
100 return 1;
101 }
102
103 return 0;
104}
105
106/*
107 * Error/Semi-error returns:
108 * 0xFF - when MTRR is not enabled
 109 * *repeat == 1 implies that [start:end] spanned across an MTRR range and the type
 110 * returned corresponds only to [start:*partial_end].
 111 * The caller has to look up [*partial_end:end] again.
112 */
113static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
74{ 114{
75 int i; 115 int i;
76 u64 base, mask; 116 u64 base, mask;
77 u8 prev_match, curr_match; 117 u8 prev_match, curr_match;
78 118
119 *repeat = 0;
79 if (!mtrr_state_set) 120 if (!mtrr_state_set)
80 return 0xFF; 121 return 0xFF;
81 122
@@ -126,8 +167,34 @@ u8 mtrr_type_lookup(u64 start, u64 end)
126 167
127 start_state = ((start & mask) == (base & mask)); 168 start_state = ((start & mask) == (base & mask));
128 end_state = ((end & mask) == (base & mask)); 169 end_state = ((end & mask) == (base & mask));
129 if (start_state != end_state) 170
130 return 0xFE; 171 if (start_state != end_state) {
172 /*
173 * We have start:end spanning across an MTRR.
174 * We split the region into
175 * either
176 * (start:mtrr_end) (mtrr_end:end)
177 * or
178 * (start:mtrr_start) (mtrr_start:end)
 179 * depending on the kind of overlap.
 180 * Return the type of the first region and a pointer to
 181 * the start of the second region so that the caller will
 182 * look up the second region again.
183 * Note: This way we handle multiple overlaps as well.
184 */
185 if (start_state)
186 *partial_end = base + get_mtrr_size(mask);
187 else
188 *partial_end = base;
189
190 if (unlikely(*partial_end <= start)) {
191 WARN_ON(1);
192 *partial_end = start + PAGE_SIZE;
193 }
194
195 end = *partial_end - 1; /* end is inclusive */
196 *repeat = 1;
197 }
131 198
132 if ((start & mask) != (base & mask)) 199 if ((start & mask) != (base & mask))
133 continue; 200 continue;
@@ -138,21 +205,8 @@ u8 mtrr_type_lookup(u64 start, u64 end)
138 continue; 205 continue;
139 } 206 }
140 207
141 if (prev_match == MTRR_TYPE_UNCACHABLE || 208 if (check_type_overlap(&prev_match, &curr_match))
142 curr_match == MTRR_TYPE_UNCACHABLE) { 209 return curr_match;
143 return MTRR_TYPE_UNCACHABLE;
144 }
145
146 if ((prev_match == MTRR_TYPE_WRBACK &&
147 curr_match == MTRR_TYPE_WRTHROUGH) ||
148 (prev_match == MTRR_TYPE_WRTHROUGH &&
149 curr_match == MTRR_TYPE_WRBACK)) {
150 prev_match = MTRR_TYPE_WRTHROUGH;
151 curr_match = MTRR_TYPE_WRTHROUGH;
152 }
153
154 if (prev_match != curr_match)
155 return MTRR_TYPE_UNCACHABLE;
156 } 210 }
157 211
158 if (mtrr_tom2) { 212 if (mtrr_tom2) {
@@ -166,6 +220,36 @@ u8 mtrr_type_lookup(u64 start, u64 end)
166 return mtrr_state.def_type; 220 return mtrr_state.def_type;
167} 221}
168 222
223/*
224 * Returns the effective MTRR type for the region
225 * Error return:
226 * 0xFF - when MTRR is not enabled
227 */
228u8 mtrr_type_lookup(u64 start, u64 end)
229{
230 u8 type, prev_type;
231 int repeat;
232 u64 partial_end;
233
234 type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
235
236 /*
237 * Common path is with repeat = 0.
238 * However, we can have cases where [start:end] spans across some
239 * MTRR range. Do repeated lookups for that case here.
240 */
241 while (repeat) {
242 prev_type = type;
243 start = partial_end;
244 type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
245
246 if (check_type_overlap(&prev_type, &type))
247 return type;
248 }
249
250 return type;
251}
252
169/* Get the MSR pair relating to a var range */ 253/* Get the MSR pair relating to a var range */
170static void 254static void
171get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) 255get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
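
The mtrr/generic.c rework factors the effective-type rule for overlapping variable MTRRs into check_type_overlap(): UC wins outright, WB combined with WT degrades to WT, and any other mismatch degrades to UC; the new split/repeat path in mtrr_type_lookup() then applies the same rule across the partial ranges it walks. The standalone snippet below reproduces that rule with the architectural MTRR type encodings (UC=0, WC=1, WT=4, WP=5, WB=6) and exercises the three cases.

/* Standalone copy of the effective-type rule factored out above.
 * Type encodings follow the architectural MTRR values (MTRR_TYPE_* in the
 * kernel): UC=0, WC=1, WT=4, WP=5, WB=6.
 */
#include <stdio.h>

enum { UC = 0, WC = 1, WT = 4, WP = 5, WB = 6 };

/* Returns 1 if the effective type is UNCACHEABLE, else 0 (as in the patch). */
static int check_type_overlap(unsigned char *prev, unsigned char *curr)
{
	if (*prev == UC || *curr == UC) {
		*prev = *curr = UC;
		return 1;
	}

	if ((*prev == WB && *curr == WT) || (*prev == WT && *curr == WB))
		*prev = *curr = WT;

	if (*prev != *curr) {
		*prev = *curr = UC;
		return 1;
	}

	return 0;
}

int main(void)
{
	unsigned char a, b;

	a = WB; b = WT; check_type_overlap(&a, &b);	/* -> WT                  */
	printf("WB+WT -> %u\n", a);

	a = WB; b = UC; check_type_overlap(&a, &b);	/* -> UC, lookup can stop */
	printf("WB+UC -> %u\n", a);

	a = WC; b = WB; check_type_overlap(&a, &b);	/* mismatch -> UC         */
	printf("WC+WB -> %u\n", a);
	return 0;
}
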
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index fb329e9f8494..d9f4ff8fcd69 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -700,11 +700,10 @@ static void probe_nmi_watchdog(void)
700{ 700{
701 switch (boot_cpu_data.x86_vendor) { 701 switch (boot_cpu_data.x86_vendor) {
702 case X86_VENDOR_AMD: 702 case X86_VENDOR_AMD:
703 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && 703 if (boot_cpu_data.x86 == 6 ||
704 boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17) 704 (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x15))
705 return; 705 wd_ops = &k7_wd_ops;
706 wd_ops = &k7_wd_ops; 706 return;
707 break;
708 case X86_VENDOR_INTEL: 707 case X86_VENDOR_INTEL:
709 /* Work around where perfctr1 doesn't have a working enable 708 /* Work around where perfctr1 doesn't have a working enable
710 * bit as described in the following errata: 709 * bit as described in the following errata:
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index d49079515122..c7f64e6f537a 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -44,6 +44,12 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
44 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, 44 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
45 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, 45 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
46 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, 46 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 },
47 { X86_FEATURE_TSCRATEMSR, CR_EDX, 4, 0x8000000a, 0 },
48 { X86_FEATURE_VMCBCLEAN, CR_EDX, 5, 0x8000000a, 0 },
49 { X86_FEATURE_FLUSHBYASID, CR_EDX, 6, 0x8000000a, 0 },
50 { X86_FEATURE_DECODEASSISTS, CR_EDX, 7, 0x8000000a, 0 },
51 { X86_FEATURE_PAUSEFILTER, CR_EDX,10, 0x8000000a, 0 },
52 { X86_FEATURE_PFTHRESHOLD, CR_EDX,12, 0x8000000a, 0 },
47 { 0, 0, 0, 0, 0 } 53 { 0, 0, 0, 0, 0 }
48 }; 54 };
49 55
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 045b36cada65..994828899e09 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -34,7 +34,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
34 if (!csize) 34 if (!csize)
35 return 0; 35 return 0;
36 36
37 vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); 37 vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
38 if (!vaddr) 38 if (!vaddr)
39 return -ENOMEM; 39 return -ENOMEM;
40 40
@@ -46,6 +46,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
46 } else 46 } else
47 memcpy(buf, vaddr + offset, csize); 47 memcpy(buf, vaddr + offset, csize);
48 48
49 set_iounmap_nonlazy();
49 iounmap(vaddr); 50 iounmap(vaddr);
50 return csize; 51 return csize;
51} 52}
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index ebdb85cf2686..76b8cd953dee 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -97,7 +97,6 @@ static void __init nvidia_bugs(int num, int slot, int func)
97} 97}
98 98
99#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) 99#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
100#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
101static u32 __init ati_ixp4x0_rev(int num, int slot, int func) 100static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
102{ 101{
103 u32 d; 102 u32 d;
@@ -115,7 +114,6 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
115 d &= 0xff; 114 d &= 0xff;
116 return d; 115 return d;
117} 116}
118#endif
119 117
120static void __init ati_bugs(int num, int slot, int func) 118static void __init ati_bugs(int num, int slot, int func)
121{ 119{
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index fa99bae75ace..4572f25f9325 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -14,6 +14,7 @@
14#include <xen/hvc-console.h> 14#include <xen/hvc-console.h>
15#include <asm/pci-direct.h> 15#include <asm/pci-direct.h>
16#include <asm/fixmap.h> 16#include <asm/fixmap.h>
17#include <asm/mrst.h>
17#include <asm/pgtable.h> 18#include <asm/pgtable.h>
18#include <linux/usb/ehci_def.h> 19#include <linux/usb/ehci_def.h>
19 20
@@ -239,6 +240,18 @@ static int __init setup_early_printk(char *buf)
239 if (!strncmp(buf, "xen", 3)) 240 if (!strncmp(buf, "xen", 3))
240 early_console_register(&xenboot_console, keep); 241 early_console_register(&xenboot_console, keep);
241#endif 242#endif
243#ifdef CONFIG_X86_MRST_EARLY_PRINTK
244 if (!strncmp(buf, "mrst", 4)) {
245 mrst_early_console_init();
246 early_console_register(&early_mrst_console, keep);
247 }
248
249 if (!strncmp(buf, "hsu", 3)) {
250 hsu_early_console_init();
251 early_console_register(&early_hsu_console, keep);
252 }
253
254#endif
242 buf++; 255 buf++;
243 } 256 }
244 return 0; 257 return 0;
diff --git a/arch/x86/kernel/early_printk_mrst.c b/arch/x86/kernel/early_printk_mrst.c
new file mode 100644
index 000000000000..65df603622b2
--- /dev/null
+++ b/arch/x86/kernel/early_printk_mrst.c
@@ -0,0 +1,319 @@
1/*
2 * early_printk_mrst.c - early consoles for Intel MID platforms
3 *
4 * Copyright (c) 2008-2010, Intel Corporation
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; version 2
9 * of the License.
10 */
11
12/*
13 * This file implements two early consoles named mrst and hsu.
 14 * mrst is based on the Maxim3110 spi-uart device; it exists in both
 15 * the Moorestown and Medfield platforms, while hsu is based on a High
 16 * Speed UART device which only exists in the Medfield platform.
17 */
18
19#include <linux/serial_reg.h>
20#include <linux/serial_mfd.h>
21#include <linux/kmsg_dump.h>
22#include <linux/console.h>
23#include <linux/kernel.h>
24#include <linux/delay.h>
25#include <linux/init.h>
26#include <linux/io.h>
27
28#include <asm/fixmap.h>
29#include <asm/pgtable.h>
30#include <asm/mrst.h>
31
32#define MRST_SPI_TIMEOUT 0x200000
33#define MRST_REGBASE_SPI0 0xff128000
34#define MRST_REGBASE_SPI1 0xff128400
35#define MRST_CLK_SPI0_REG 0xff11d86c
36
37/* Bit fields in CTRLR0 */
38#define SPI_DFS_OFFSET 0
39
40#define SPI_FRF_OFFSET 4
41#define SPI_FRF_SPI 0x0
42#define SPI_FRF_SSP 0x1
43#define SPI_FRF_MICROWIRE 0x2
44#define SPI_FRF_RESV 0x3
45
46#define SPI_MODE_OFFSET 6
47#define SPI_SCPH_OFFSET 6
48#define SPI_SCOL_OFFSET 7
49#define SPI_TMOD_OFFSET 8
50#define SPI_TMOD_TR 0x0 /* xmit & recv */
51#define SPI_TMOD_TO 0x1 /* xmit only */
52#define SPI_TMOD_RO 0x2 /* recv only */
53#define SPI_TMOD_EPROMREAD 0x3 /* eeprom read mode */
54
55#define SPI_SLVOE_OFFSET 10
56#define SPI_SRL_OFFSET 11
57#define SPI_CFS_OFFSET 12
58
59/* Bit fields in SR, 7 bits */
60#define SR_MASK 0x7f /* cover 7 bits */
61#define SR_BUSY (1 << 0)
62#define SR_TF_NOT_FULL (1 << 1)
63#define SR_TF_EMPT (1 << 2)
64#define SR_RF_NOT_EMPT (1 << 3)
65#define SR_RF_FULL (1 << 4)
66#define SR_TX_ERR (1 << 5)
67#define SR_DCOL (1 << 6)
68
69struct dw_spi_reg {
70 u32 ctrl0;
71 u32 ctrl1;
72 u32 ssienr;
73 u32 mwcr;
74 u32 ser;
75 u32 baudr;
76 u32 txfltr;
77 u32 rxfltr;
78 u32 txflr;
79 u32 rxflr;
80 u32 sr;
81 u32 imr;
82 u32 isr;
83 u32 risr;
84 u32 txoicr;
85 u32 rxoicr;
86 u32 rxuicr;
87 u32 msticr;
88 u32 icr;
89 u32 dmacr;
90 u32 dmatdlr;
91 u32 dmardlr;
92 u32 idr;
93 u32 version;
94
95 /* Currently operates as 32 bits, though only the low 16 bits matter */
96 u32 dr;
97} __packed;
98
99#define dw_readl(dw, name) __raw_readl(&(dw)->name)
100#define dw_writel(dw, name, val) __raw_writel((val), &(dw)->name)
101
 102/* Use the SPI0 registers for mrst by default; we will detect Penwell and use SPI1 */
103static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0;
104
105static u32 *pclk_spi0;
 106/* Always contains an accessible address; starts as 0 */
107static struct dw_spi_reg *pspi;
108
109static struct kmsg_dumper dw_dumper;
110static int dumper_registered;
111
112static void dw_kmsg_dump(struct kmsg_dumper *dumper,
113 enum kmsg_dump_reason reason,
114 const char *s1, unsigned long l1,
115 const char *s2, unsigned long l2)
116{
117 int i;
118
 119 /* When we reach this point, we'd better re-init the HW */
120 mrst_early_console_init();
121
122 for (i = 0; i < l1; i++)
123 early_mrst_console.write(&early_mrst_console, s1 + i, 1);
124 for (i = 0; i < l2; i++)
125 early_mrst_console.write(&early_mrst_console, s2 + i, 1);
126}
127
 128/* Set the baud rate to 115200, 8n1, IRQ disabled */
129static void max3110_write_config(void)
130{
131 u16 config;
132
133 config = 0xc001;
134 dw_writel(pspi, dr, config);
135}
136
 137/* Translate a char into an eligible word and send it to the max3110 */
138static void max3110_write_data(char c)
139{
140 u16 data;
141
142 data = 0x8000 | c;
143 dw_writel(pspi, dr, data);
144}
145
146void mrst_early_console_init(void)
147{
148 u32 ctrlr0 = 0;
149 u32 spi0_cdiv;
 150 u32 freq; /* Frequency info only needs to be read once */
151
152 /* Base clk is 100 MHz, the actual clk = 100M / (clk_divider + 1) */
153 pclk_spi0 = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
154 MRST_CLK_SPI0_REG);
155 spi0_cdiv = ((*pclk_spi0) & 0xe00) >> 9;
156 freq = 100000000 / (spi0_cdiv + 1);
157
158 if (mrst_identify_cpu() == MRST_CPU_CHIP_PENWELL)
159 mrst_spi_paddr = MRST_REGBASE_SPI1;
160
161 pspi = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
162 mrst_spi_paddr);
163
164 /* Disable SPI controller */
165 dw_writel(pspi, ssienr, 0);
166
167 /* Set control param, 8 bits, transmit only mode */
168 ctrlr0 = dw_readl(pspi, ctrl0);
169
170 ctrlr0 &= 0xfcc0;
171 ctrlr0 |= 0xf | (SPI_FRF_SPI << SPI_FRF_OFFSET)
172 | (SPI_TMOD_TO << SPI_TMOD_OFFSET);
173 dw_writel(pspi, ctrl0, ctrlr0);
174
175 /*
176 * Change the spi0 clk to comply with 115200 bps, use 100000 to
 177 * calculate the clk divisor to make the clock a little slower
178 * than real baud rate.
179 */
180 dw_writel(pspi, baudr, freq/100000);
181
182 /* Disable all INT for early phase */
183 dw_writel(pspi, imr, 0x0);
184
185 /* Set the cs to spi-uart */
186 dw_writel(pspi, ser, 0x2);
187
188 /* Enable the HW, the last step for HW init */
189 dw_writel(pspi, ssienr, 0x1);
190
191 /* Set the default configuration */
192 max3110_write_config();
193
194 /* Register the kmsg dumper */
195 if (!dumper_registered) {
196 dw_dumper.dump = dw_kmsg_dump;
197 kmsg_dump_register(&dw_dumper);
198 dumper_registered = 1;
199 }
200}
201
 202/* Slave select should be handled in the read/write function */
203static void early_mrst_spi_putc(char c)
204{
205 unsigned int timeout;
206 u32 sr;
207
208 timeout = MRST_SPI_TIMEOUT;
209 /* Early putc needs to make sure the TX FIFO is not full */
210 while (--timeout) {
211 sr = dw_readl(pspi, sr);
212 if (!(sr & SR_TF_NOT_FULL))
213 cpu_relax();
214 else
215 break;
216 }
217
218 if (!timeout)
219 pr_warning("MRST earlycon: timed out\n");
220 else
221 max3110_write_data(c);
222}
223
224/* Early SPI only uses polling mode */
225static void early_mrst_spi_write(struct console *con, const char *str, unsigned n)
226{
227 int i;
228
229 for (i = 0; i < n && *str; i++) {
230 if (*str == '\n')
231 early_mrst_spi_putc('\r');
232 early_mrst_spi_putc(*str);
233 str++;
234 }
235}
236
237struct console early_mrst_console = {
238 .name = "earlymrst",
239 .write = early_mrst_spi_write,
240 .flags = CON_PRINTBUFFER,
241 .index = -1,
242};
243
244/*
245 * Following is the early console based on Medfield HSU (High
246 * Speed UART) device.
247 */
248#define HSU_PORT2_PADDR 0xffa28180
249
250static void __iomem *phsu;
251
252void hsu_early_console_init(void)
253{
254 u8 lcr;
255
256 phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
257 HSU_PORT2_PADDR);
258
259 /* Disable FIFO */
260 writeb(0x0, phsu + UART_FCR);
261
262 /* Set to default 115200 bps, 8n1 */
263 lcr = readb(phsu + UART_LCR);
264 writeb((0x80 | lcr), phsu + UART_LCR);
265 writeb(0x18, phsu + UART_DLL);
266 writeb(lcr, phsu + UART_LCR);
267 writel(0x3600, phsu + UART_MUL*4);
268
269 writeb(0x8, phsu + UART_MCR);
270 writeb(0x7, phsu + UART_FCR);
271 writeb(0x3, phsu + UART_LCR);
272
273 /* Clear IRQ status */
274 readb(phsu + UART_LSR);
275 readb(phsu + UART_RX);
276 readb(phsu + UART_IIR);
277 readb(phsu + UART_MSR);
278
279 /* Enable FIFO */
280 writeb(0x7, phsu + UART_FCR);
281}
282
283#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
284
285static void early_hsu_putc(char ch)
286{
287 unsigned int timeout = 10000; /* 10ms */
288 u8 status;
289
290 while (--timeout) {
291 status = readb(phsu + UART_LSR);
292 if (status & BOTH_EMPTY)
293 break;
294 udelay(1);
295 }
296
297 /* Only write the char when there was no timeout */
298 if (timeout)
299 writeb(ch, phsu + UART_TX);
300}
301
302static void early_hsu_write(struct console *con, const char *str, unsigned n)
303{
304 int i;
305
306 for (i = 0; i < n && *str; i++) {
307 if (*str == '\n')
308 early_hsu_putc('\r');
309 early_hsu_putc(*str);
310 str++;
311 }
312}
313
314struct console early_hsu_console = {
315 .name = "earlyhsu",
316 .write = early_hsu_write,
317 .flags = CON_PRINTBUFFER,
318 .index = -1,
319};
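
mrst_early_console_init() above derives its SPI input clock from a fixed 100 MHz base divided by (clk_divider + 1) and programs BAUDR as freq/100000, which by the file's own comment keeps the resulting clock a little slower than the 115200 bps target. A worked example of that arithmetic, using the formulas exactly as they appear above with a sample divider value:

/* Worked example of the clock math in mrst_early_console_init() above:
 * freq = 100 MHz / (spi0_cdiv + 1), BAUDR divider = freq / 100000.
 */
#include <stdio.h>

int main(void)
{
	unsigned int spi0_cdiv = 2;			 /* sample value of (clk reg >> 9) & 7 */
	unsigned int freq = 100000000 / (spi0_cdiv + 1); /* 33.33 MHz SPI input clock          */
	unsigned int baudr = freq / 100000;		 /* 333                                */

	/* Resulting SPI shift clock, kept below the 115200 baud target as the
	 * driver comment describes. */
	printf("cdiv=%u freq=%u Hz baudr=%u -> SPI clock %u Hz\n",
	       spi0_cdiv, freq, baudr, freq / baudr);
	return 0;
}
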
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 227d00920d2f..9fb188d7bc76 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -115,8 +115,7 @@
115 115
116 /* unfortunately push/pop can't be no-op */ 116 /* unfortunately push/pop can't be no-op */
117.macro PUSH_GS 117.macro PUSH_GS
118 pushl $0 118 pushl_cfi $0
119 CFI_ADJUST_CFA_OFFSET 4
120.endm 119.endm
121.macro POP_GS pop=0 120.macro POP_GS pop=0
122 addl $(4 + \pop), %esp 121 addl $(4 + \pop), %esp
@@ -140,14 +139,12 @@
140#else /* CONFIG_X86_32_LAZY_GS */ 139#else /* CONFIG_X86_32_LAZY_GS */
141 140
142.macro PUSH_GS 141.macro PUSH_GS
143 pushl %gs 142 pushl_cfi %gs
144 CFI_ADJUST_CFA_OFFSET 4
145 /*CFI_REL_OFFSET gs, 0*/ 143 /*CFI_REL_OFFSET gs, 0*/
146.endm 144.endm
147 145
148.macro POP_GS pop=0 146.macro POP_GS pop=0
14998: popl %gs 14798: popl_cfi %gs
150 CFI_ADJUST_CFA_OFFSET -4
151 /*CFI_RESTORE gs*/ 148 /*CFI_RESTORE gs*/
152 .if \pop <> 0 149 .if \pop <> 0
153 add $\pop, %esp 150 add $\pop, %esp
@@ -195,35 +192,25 @@
195.macro SAVE_ALL 192.macro SAVE_ALL
196 cld 193 cld
197 PUSH_GS 194 PUSH_GS
198 pushl %fs 195 pushl_cfi %fs
199 CFI_ADJUST_CFA_OFFSET 4
200 /*CFI_REL_OFFSET fs, 0;*/ 196 /*CFI_REL_OFFSET fs, 0;*/
201 pushl %es 197 pushl_cfi %es
202 CFI_ADJUST_CFA_OFFSET 4
203 /*CFI_REL_OFFSET es, 0;*/ 198 /*CFI_REL_OFFSET es, 0;*/
204 pushl %ds 199 pushl_cfi %ds
205 CFI_ADJUST_CFA_OFFSET 4
206 /*CFI_REL_OFFSET ds, 0;*/ 200 /*CFI_REL_OFFSET ds, 0;*/
207 pushl %eax 201 pushl_cfi %eax
208 CFI_ADJUST_CFA_OFFSET 4
209 CFI_REL_OFFSET eax, 0 202 CFI_REL_OFFSET eax, 0
210 pushl %ebp 203 pushl_cfi %ebp
211 CFI_ADJUST_CFA_OFFSET 4
212 CFI_REL_OFFSET ebp, 0 204 CFI_REL_OFFSET ebp, 0
213 pushl %edi 205 pushl_cfi %edi
214 CFI_ADJUST_CFA_OFFSET 4
215 CFI_REL_OFFSET edi, 0 206 CFI_REL_OFFSET edi, 0
216 pushl %esi 207 pushl_cfi %esi
217 CFI_ADJUST_CFA_OFFSET 4
218 CFI_REL_OFFSET esi, 0 208 CFI_REL_OFFSET esi, 0
219 pushl %edx 209 pushl_cfi %edx
220 CFI_ADJUST_CFA_OFFSET 4
221 CFI_REL_OFFSET edx, 0 210 CFI_REL_OFFSET edx, 0
222 pushl %ecx 211 pushl_cfi %ecx
223 CFI_ADJUST_CFA_OFFSET 4
224 CFI_REL_OFFSET ecx, 0 212 CFI_REL_OFFSET ecx, 0
225 pushl %ebx 213 pushl_cfi %ebx
226 CFI_ADJUST_CFA_OFFSET 4
227 CFI_REL_OFFSET ebx, 0 214 CFI_REL_OFFSET ebx, 0
228 movl $(__USER_DS), %edx 215 movl $(__USER_DS), %edx
229 movl %edx, %ds 216 movl %edx, %ds
@@ -234,39 +221,29 @@
234.endm 221.endm
235 222
236.macro RESTORE_INT_REGS 223.macro RESTORE_INT_REGS
237 popl %ebx 224 popl_cfi %ebx
238 CFI_ADJUST_CFA_OFFSET -4
239 CFI_RESTORE ebx 225 CFI_RESTORE ebx
240 popl %ecx 226 popl_cfi %ecx
241 CFI_ADJUST_CFA_OFFSET -4
242 CFI_RESTORE ecx 227 CFI_RESTORE ecx
243 popl %edx 228 popl_cfi %edx
244 CFI_ADJUST_CFA_OFFSET -4
245 CFI_RESTORE edx 229 CFI_RESTORE edx
246 popl %esi 230 popl_cfi %esi
247 CFI_ADJUST_CFA_OFFSET -4
248 CFI_RESTORE esi 231 CFI_RESTORE esi
249 popl %edi 232 popl_cfi %edi
250 CFI_ADJUST_CFA_OFFSET -4
251 CFI_RESTORE edi 233 CFI_RESTORE edi
252 popl %ebp 234 popl_cfi %ebp
253 CFI_ADJUST_CFA_OFFSET -4
254 CFI_RESTORE ebp 235 CFI_RESTORE ebp
255 popl %eax 236 popl_cfi %eax
256 CFI_ADJUST_CFA_OFFSET -4
257 CFI_RESTORE eax 237 CFI_RESTORE eax
258.endm 238.endm
259 239
260.macro RESTORE_REGS pop=0 240.macro RESTORE_REGS pop=0
261 RESTORE_INT_REGS 241 RESTORE_INT_REGS
2621: popl %ds 2421: popl_cfi %ds
263 CFI_ADJUST_CFA_OFFSET -4
264 /*CFI_RESTORE ds;*/ 243 /*CFI_RESTORE ds;*/
2652: popl %es 2442: popl_cfi %es
266 CFI_ADJUST_CFA_OFFSET -4
267 /*CFI_RESTORE es;*/ 245 /*CFI_RESTORE es;*/
2683: popl %fs 2463: popl_cfi %fs
269 CFI_ADJUST_CFA_OFFSET -4
270 /*CFI_RESTORE fs;*/ 247 /*CFI_RESTORE fs;*/
271 POP_GS \pop 248 POP_GS \pop
272.pushsection .fixup, "ax" 249.pushsection .fixup, "ax"
@@ -320,16 +297,12 @@
320 297
321ENTRY(ret_from_fork) 298ENTRY(ret_from_fork)
322 CFI_STARTPROC 299 CFI_STARTPROC
323 pushl %eax 300 pushl_cfi %eax
324 CFI_ADJUST_CFA_OFFSET 4
325 call schedule_tail 301 call schedule_tail
326 GET_THREAD_INFO(%ebp) 302 GET_THREAD_INFO(%ebp)
327 popl %eax 303 popl_cfi %eax
328 CFI_ADJUST_CFA_OFFSET -4 304 pushl_cfi $0x0202 # Reset kernel eflags
329 pushl $0x0202 # Reset kernel eflags 305 popfl_cfi
330 CFI_ADJUST_CFA_OFFSET 4
331 popfl
332 CFI_ADJUST_CFA_OFFSET -4
333 jmp syscall_exit 306 jmp syscall_exit
334 CFI_ENDPROC 307 CFI_ENDPROC
335END(ret_from_fork) 308END(ret_from_fork)
@@ -409,29 +382,23 @@ sysenter_past_esp:
409 * enough kernel state to call TRACE_IRQS_OFF can be called - but 382 * enough kernel state to call TRACE_IRQS_OFF can be called - but
410 * we immediately enable interrupts at that point anyway. 383 * we immediately enable interrupts at that point anyway.
411 */ 384 */
412 pushl $(__USER_DS) 385 pushl_cfi $(__USER_DS)
413 CFI_ADJUST_CFA_OFFSET 4
414 /*CFI_REL_OFFSET ss, 0*/ 386 /*CFI_REL_OFFSET ss, 0*/
415 pushl %ebp 387 pushl_cfi %ebp
416 CFI_ADJUST_CFA_OFFSET 4
417 CFI_REL_OFFSET esp, 0 388 CFI_REL_OFFSET esp, 0
418 pushfl 389 pushfl_cfi
419 orl $X86_EFLAGS_IF, (%esp) 390 orl $X86_EFLAGS_IF, (%esp)
420 CFI_ADJUST_CFA_OFFSET 4 391 pushl_cfi $(__USER_CS)
421 pushl $(__USER_CS)
422 CFI_ADJUST_CFA_OFFSET 4
423 /*CFI_REL_OFFSET cs, 0*/ 392 /*CFI_REL_OFFSET cs, 0*/
424 /* 393 /*
425 * Push current_thread_info()->sysenter_return to the stack. 394 * Push current_thread_info()->sysenter_return to the stack.
426 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words 395 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
427 * pushed above; +8 corresponds to copy_thread's esp0 setting. 396 * pushed above; +8 corresponds to copy_thread's esp0 setting.
428 */ 397 */
429 pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) 398 pushl_cfi (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp)
430 CFI_ADJUST_CFA_OFFSET 4
431 CFI_REL_OFFSET eip, 0 399 CFI_REL_OFFSET eip, 0
432 400
433 pushl %eax 401 pushl_cfi %eax
434 CFI_ADJUST_CFA_OFFSET 4
435 SAVE_ALL 402 SAVE_ALL
436 ENABLE_INTERRUPTS(CLBR_NONE) 403 ENABLE_INTERRUPTS(CLBR_NONE)
437 404
@@ -486,8 +453,7 @@ sysenter_audit:
486 movl %eax,%edx /* 2nd arg: syscall number */ 453 movl %eax,%edx /* 2nd arg: syscall number */
487 movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ 454 movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
488 call audit_syscall_entry 455 call audit_syscall_entry
489 pushl %ebx 456 pushl_cfi %ebx
490 CFI_ADJUST_CFA_OFFSET 4
491 movl PT_EAX(%esp),%eax /* reload syscall number */ 457 movl PT_EAX(%esp),%eax /* reload syscall number */
492 jmp sysenter_do_call 458 jmp sysenter_do_call
493 459
@@ -529,8 +495,7 @@ ENDPROC(ia32_sysenter_target)
529 # system call handler stub 495 # system call handler stub
530ENTRY(system_call) 496ENTRY(system_call)
531 RING0_INT_FRAME # can't unwind into user space anyway 497 RING0_INT_FRAME # can't unwind into user space anyway
532 pushl %eax # save orig_eax 498 pushl_cfi %eax # save orig_eax
533 CFI_ADJUST_CFA_OFFSET 4
534 SAVE_ALL 499 SAVE_ALL
535 GET_THREAD_INFO(%ebp) 500 GET_THREAD_INFO(%ebp)
536 # system call tracing in operation / emulation 501 # system call tracing in operation / emulation
@@ -566,7 +531,6 @@ restore_all_notrace:
566 je ldt_ss # returning to user-space with LDT SS 531 je ldt_ss # returning to user-space with LDT SS
567restore_nocheck: 532restore_nocheck:
568 RESTORE_REGS 4 # skip orig_eax/error_code 533 RESTORE_REGS 4 # skip orig_eax/error_code
569 CFI_ADJUST_CFA_OFFSET -4
570irq_return: 534irq_return:
571 INTERRUPT_RETURN 535 INTERRUPT_RETURN
572.section .fixup,"ax" 536.section .fixup,"ax"
@@ -619,10 +583,8 @@ ldt_ss:
619 shr $16, %edx 583 shr $16, %edx
620 mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ 584 mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
621 mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ 585 mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
622 pushl $__ESPFIX_SS 586 pushl_cfi $__ESPFIX_SS
623 CFI_ADJUST_CFA_OFFSET 4 587 pushl_cfi %eax /* new kernel esp */
624 push %eax /* new kernel esp */
625 CFI_ADJUST_CFA_OFFSET 4
626 /* Disable interrupts, but do not irqtrace this section: we 588 /* Disable interrupts, but do not irqtrace this section: we
627 * will soon execute iret and the tracer was already set to 589 * will soon execute iret and the tracer was already set to
628 * the irqstate after the iret */ 590 * the irqstate after the iret */
@@ -666,11 +628,9 @@ work_notifysig: # deal with pending signals and
666 628
667 ALIGN 629 ALIGN
668work_notifysig_v86: 630work_notifysig_v86:
669 pushl %ecx # save ti_flags for do_notify_resume 631 pushl_cfi %ecx # save ti_flags for do_notify_resume
670 CFI_ADJUST_CFA_OFFSET 4
671 call save_v86_state # %eax contains pt_regs pointer 632 call save_v86_state # %eax contains pt_regs pointer
672 popl %ecx 633 popl_cfi %ecx
673 CFI_ADJUST_CFA_OFFSET -4
674 movl %eax, %esp 634 movl %eax, %esp
675#else 635#else
676 movl %esp, %eax 636 movl %esp, %eax
@@ -750,14 +710,18 @@ ptregs_##name: \
750#define PTREGSCALL3(name) \ 710#define PTREGSCALL3(name) \
751 ALIGN; \ 711 ALIGN; \
752ptregs_##name: \ 712ptregs_##name: \
713 CFI_STARTPROC; \
753 leal 4(%esp),%eax; \ 714 leal 4(%esp),%eax; \
754 pushl %eax; \ 715 pushl_cfi %eax; \
755 movl PT_EDX(%eax),%ecx; \ 716 movl PT_EDX(%eax),%ecx; \
756 movl PT_ECX(%eax),%edx; \ 717 movl PT_ECX(%eax),%edx; \
757 movl PT_EBX(%eax),%eax; \ 718 movl PT_EBX(%eax),%eax; \
758 call sys_##name; \ 719 call sys_##name; \
759 addl $4,%esp; \ 720 addl $4,%esp; \
760 ret 721 CFI_ADJUST_CFA_OFFSET -4; \
722 ret; \
723 CFI_ENDPROC; \
724ENDPROC(ptregs_##name)
761 725
762PTREGSCALL1(iopl) 726PTREGSCALL1(iopl)
763PTREGSCALL0(fork) 727PTREGSCALL0(fork)
@@ -772,15 +736,19 @@ PTREGSCALL1(vm86old)
772/* Clone is an oddball. The 4th arg is in %edi */ 736/* Clone is an oddball. The 4th arg is in %edi */
773 ALIGN; 737 ALIGN;
774ptregs_clone: 738ptregs_clone:
739 CFI_STARTPROC
775 leal 4(%esp),%eax 740 leal 4(%esp),%eax
776 pushl %eax 741 pushl_cfi %eax
777 pushl PT_EDI(%eax) 742 pushl_cfi PT_EDI(%eax)
778 movl PT_EDX(%eax),%ecx 743 movl PT_EDX(%eax),%ecx
779 movl PT_ECX(%eax),%edx 744 movl PT_ECX(%eax),%edx
780 movl PT_EBX(%eax),%eax 745 movl PT_EBX(%eax),%eax
781 call sys_clone 746 call sys_clone
782 addl $8,%esp 747 addl $8,%esp
748 CFI_ADJUST_CFA_OFFSET -8
783 ret 749 ret
750 CFI_ENDPROC
751ENDPROC(ptregs_clone)
784 752
785.macro FIXUP_ESPFIX_STACK 753.macro FIXUP_ESPFIX_STACK
786/* 754/*
@@ -795,10 +763,8 @@ ptregs_clone:
795 mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ 763 mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
796 shl $16, %eax 764 shl $16, %eax
797 addl %esp, %eax /* the adjusted stack pointer */ 765 addl %esp, %eax /* the adjusted stack pointer */
798 pushl $__KERNEL_DS 766 pushl_cfi $__KERNEL_DS
799 CFI_ADJUST_CFA_OFFSET 4 767 pushl_cfi %eax
800 pushl %eax
801 CFI_ADJUST_CFA_OFFSET 4
802 lss (%esp), %esp /* switch to the normal stack segment */ 768 lss (%esp), %esp /* switch to the normal stack segment */
803 CFI_ADJUST_CFA_OFFSET -8 769 CFI_ADJUST_CFA_OFFSET -8
804.endm 770.endm
@@ -835,8 +801,7 @@ vector=FIRST_EXTERNAL_VECTOR
835 .if vector <> FIRST_EXTERNAL_VECTOR 801 .if vector <> FIRST_EXTERNAL_VECTOR
836 CFI_ADJUST_CFA_OFFSET -4 802 CFI_ADJUST_CFA_OFFSET -4
837 .endif 803 .endif
8381: pushl $(~vector+0x80) /* Note: always in signed byte range */ 8041: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */
839 CFI_ADJUST_CFA_OFFSET 4
840 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 805 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
841 jmp 2f 806 jmp 2f
842 .endif 807 .endif
@@ -876,8 +841,7 @@ ENDPROC(common_interrupt)
876#define BUILD_INTERRUPT3(name, nr, fn) \ 841#define BUILD_INTERRUPT3(name, nr, fn) \
877ENTRY(name) \ 842ENTRY(name) \
878 RING0_INT_FRAME; \ 843 RING0_INT_FRAME; \
879 pushl $~(nr); \ 844 pushl_cfi $~(nr); \
880 CFI_ADJUST_CFA_OFFSET 4; \
881 SAVE_ALL; \ 845 SAVE_ALL; \
882 TRACE_IRQS_OFF \ 846 TRACE_IRQS_OFF \
883 movl %esp,%eax; \ 847 movl %esp,%eax; \
@@ -893,21 +857,18 @@ ENDPROC(name)
893 857
894ENTRY(coprocessor_error) 858ENTRY(coprocessor_error)
895 RING0_INT_FRAME 859 RING0_INT_FRAME
896 pushl $0 860 pushl_cfi $0
897 CFI_ADJUST_CFA_OFFSET 4 861 pushl_cfi $do_coprocessor_error
898 pushl $do_coprocessor_error
899 CFI_ADJUST_CFA_OFFSET 4
900 jmp error_code 862 jmp error_code
901 CFI_ENDPROC 863 CFI_ENDPROC
902END(coprocessor_error) 864END(coprocessor_error)
903 865
904ENTRY(simd_coprocessor_error) 866ENTRY(simd_coprocessor_error)
905 RING0_INT_FRAME 867 RING0_INT_FRAME
906 pushl $0 868 pushl_cfi $0
907 CFI_ADJUST_CFA_OFFSET 4
908#ifdef CONFIG_X86_INVD_BUG 869#ifdef CONFIG_X86_INVD_BUG
909 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ 870 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
910661: pushl $do_general_protection 871661: pushl_cfi $do_general_protection
911662: 872662:
912.section .altinstructions,"a" 873.section .altinstructions,"a"
913 .balign 4 874 .balign 4
@@ -922,19 +883,16 @@ ENTRY(simd_coprocessor_error)
922664: 883664:
923.previous 884.previous
924#else 885#else
925 pushl $do_simd_coprocessor_error 886 pushl_cfi $do_simd_coprocessor_error
926#endif 887#endif
927 CFI_ADJUST_CFA_OFFSET 4
928 jmp error_code 888 jmp error_code
929 CFI_ENDPROC 889 CFI_ENDPROC
930END(simd_coprocessor_error) 890END(simd_coprocessor_error)
931 891
932ENTRY(device_not_available) 892ENTRY(device_not_available)
933 RING0_INT_FRAME 893 RING0_INT_FRAME
934 pushl $-1 # mark this as an int 894 pushl_cfi $-1 # mark this as an int
935 CFI_ADJUST_CFA_OFFSET 4 895 pushl_cfi $do_device_not_available
936 pushl $do_device_not_available
937 CFI_ADJUST_CFA_OFFSET 4
938 jmp error_code 896 jmp error_code
939 CFI_ENDPROC 897 CFI_ENDPROC
940END(device_not_available) 898END(device_not_available)
@@ -956,82 +914,68 @@ END(native_irq_enable_sysexit)
956 914
957ENTRY(overflow) 915ENTRY(overflow)
958 RING0_INT_FRAME 916 RING0_INT_FRAME
959 pushl $0 917 pushl_cfi $0
960 CFI_ADJUST_CFA_OFFSET 4 918 pushl_cfi $do_overflow
961 pushl $do_overflow
962 CFI_ADJUST_CFA_OFFSET 4
963 jmp error_code 919 jmp error_code
964 CFI_ENDPROC 920 CFI_ENDPROC
965END(overflow) 921END(overflow)
966 922
967ENTRY(bounds) 923ENTRY(bounds)
968 RING0_INT_FRAME 924 RING0_INT_FRAME
969 pushl $0 925 pushl_cfi $0
970 CFI_ADJUST_CFA_OFFSET 4 926 pushl_cfi $do_bounds
971 pushl $do_bounds
972 CFI_ADJUST_CFA_OFFSET 4
973 jmp error_code 927 jmp error_code
974 CFI_ENDPROC 928 CFI_ENDPROC
975END(bounds) 929END(bounds)
976 930
977ENTRY(invalid_op) 931ENTRY(invalid_op)
978 RING0_INT_FRAME 932 RING0_INT_FRAME
979 pushl $0 933 pushl_cfi $0
980 CFI_ADJUST_CFA_OFFSET 4 934 pushl_cfi $do_invalid_op
981 pushl $do_invalid_op
982 CFI_ADJUST_CFA_OFFSET 4
983 jmp error_code 935 jmp error_code
984 CFI_ENDPROC 936 CFI_ENDPROC
985END(invalid_op) 937END(invalid_op)
986 938
987ENTRY(coprocessor_segment_overrun) 939ENTRY(coprocessor_segment_overrun)
988 RING0_INT_FRAME 940 RING0_INT_FRAME
989 pushl $0 941 pushl_cfi $0
990 CFI_ADJUST_CFA_OFFSET 4 942 pushl_cfi $do_coprocessor_segment_overrun
991 pushl $do_coprocessor_segment_overrun
992 CFI_ADJUST_CFA_OFFSET 4
993 jmp error_code 943 jmp error_code
994 CFI_ENDPROC 944 CFI_ENDPROC
995END(coprocessor_segment_overrun) 945END(coprocessor_segment_overrun)
996 946
997ENTRY(invalid_TSS) 947ENTRY(invalid_TSS)
998 RING0_EC_FRAME 948 RING0_EC_FRAME
999 pushl $do_invalid_TSS 949 pushl_cfi $do_invalid_TSS
1000 CFI_ADJUST_CFA_OFFSET 4
1001 jmp error_code 950 jmp error_code
1002 CFI_ENDPROC 951 CFI_ENDPROC
1003END(invalid_TSS) 952END(invalid_TSS)
1004 953
1005ENTRY(segment_not_present) 954ENTRY(segment_not_present)
1006 RING0_EC_FRAME 955 RING0_EC_FRAME
1007 pushl $do_segment_not_present 956 pushl_cfi $do_segment_not_present
1008 CFI_ADJUST_CFA_OFFSET 4
1009 jmp error_code 957 jmp error_code
1010 CFI_ENDPROC 958 CFI_ENDPROC
1011END(segment_not_present) 959END(segment_not_present)
1012 960
1013ENTRY(stack_segment) 961ENTRY(stack_segment)
1014 RING0_EC_FRAME 962 RING0_EC_FRAME
1015 pushl $do_stack_segment 963 pushl_cfi $do_stack_segment
1016 CFI_ADJUST_CFA_OFFSET 4
1017 jmp error_code 964 jmp error_code
1018 CFI_ENDPROC 965 CFI_ENDPROC
1019END(stack_segment) 966END(stack_segment)
1020 967
1021ENTRY(alignment_check) 968ENTRY(alignment_check)
1022 RING0_EC_FRAME 969 RING0_EC_FRAME
1023 pushl $do_alignment_check 970 pushl_cfi $do_alignment_check
1024 CFI_ADJUST_CFA_OFFSET 4
1025 jmp error_code 971 jmp error_code
1026 CFI_ENDPROC 972 CFI_ENDPROC
1027END(alignment_check) 973END(alignment_check)
1028 974
1029ENTRY(divide_error) 975ENTRY(divide_error)
1030 RING0_INT_FRAME 976 RING0_INT_FRAME
1031 pushl $0 # no error code 977 pushl_cfi $0 # no error code
1032 CFI_ADJUST_CFA_OFFSET 4 978 pushl_cfi $do_divide_error
1033 pushl $do_divide_error
1034 CFI_ADJUST_CFA_OFFSET 4
1035 jmp error_code 979 jmp error_code
1036 CFI_ENDPROC 980 CFI_ENDPROC
1037END(divide_error) 981END(divide_error)
@@ -1039,10 +983,8 @@ END(divide_error)
1039#ifdef CONFIG_X86_MCE 983#ifdef CONFIG_X86_MCE
1040ENTRY(machine_check) 984ENTRY(machine_check)
1041 RING0_INT_FRAME 985 RING0_INT_FRAME
1042 pushl $0 986 pushl_cfi $0
1043 CFI_ADJUST_CFA_OFFSET 4 987 pushl_cfi machine_check_vector
1044 pushl machine_check_vector
1045 CFI_ADJUST_CFA_OFFSET 4
1046 jmp error_code 988 jmp error_code
1047 CFI_ENDPROC 989 CFI_ENDPROC
1048END(machine_check) 990END(machine_check)
@@ -1050,10 +992,8 @@ END(machine_check)
1050 992
1051ENTRY(spurious_interrupt_bug) 993ENTRY(spurious_interrupt_bug)
1052 RING0_INT_FRAME 994 RING0_INT_FRAME
1053 pushl $0 995 pushl_cfi $0
1054 CFI_ADJUST_CFA_OFFSET 4 996 pushl_cfi $do_spurious_interrupt_bug
1055 pushl $do_spurious_interrupt_bug
1056 CFI_ADJUST_CFA_OFFSET 4
1057 jmp error_code 997 jmp error_code
1058 CFI_ENDPROC 998 CFI_ENDPROC
1059END(spurious_interrupt_bug) 999END(spurious_interrupt_bug)
@@ -1084,8 +1024,7 @@ ENTRY(xen_sysenter_target)
1084 1024
1085ENTRY(xen_hypervisor_callback) 1025ENTRY(xen_hypervisor_callback)
1086 CFI_STARTPROC 1026 CFI_STARTPROC
1087 pushl $0 1027 pushl_cfi $0
1088 CFI_ADJUST_CFA_OFFSET 4
1089 SAVE_ALL 1028 SAVE_ALL
1090 TRACE_IRQS_OFF 1029 TRACE_IRQS_OFF
1091 1030
@@ -1121,23 +1060,20 @@ ENDPROC(xen_hypervisor_callback)
1121# We distinguish between categories by maintaining a status value in EAX. 1060# We distinguish between categories by maintaining a status value in EAX.
1122ENTRY(xen_failsafe_callback) 1061ENTRY(xen_failsafe_callback)
1123 CFI_STARTPROC 1062 CFI_STARTPROC
1124 pushl %eax 1063 pushl_cfi %eax
1125 CFI_ADJUST_CFA_OFFSET 4
1126 movl $1,%eax 1064 movl $1,%eax
11271: mov 4(%esp),%ds 10651: mov 4(%esp),%ds
11282: mov 8(%esp),%es 10662: mov 8(%esp),%es
11293: mov 12(%esp),%fs 10673: mov 12(%esp),%fs
11304: mov 16(%esp),%gs 10684: mov 16(%esp),%gs
1131 testl %eax,%eax 1069 testl %eax,%eax
1132 popl %eax 1070 popl_cfi %eax
1133 CFI_ADJUST_CFA_OFFSET -4
1134 lea 16(%esp),%esp 1071 lea 16(%esp),%esp
1135 CFI_ADJUST_CFA_OFFSET -16 1072 CFI_ADJUST_CFA_OFFSET -16
1136 jz 5f 1073 jz 5f
1137 addl $16,%esp 1074 addl $16,%esp
1138 jmp iret_exc # EAX != 0 => Category 2 (Bad IRET) 1075 jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
11395: pushl $0 # EAX == 0 => Category 1 (Bad segment) 10765: pushl_cfi $0 # EAX == 0 => Category 1 (Bad segment)
1140 CFI_ADJUST_CFA_OFFSET 4
1141 SAVE_ALL 1077 SAVE_ALL
1142 jmp ret_from_exception 1078 jmp ret_from_exception
1143 CFI_ENDPROC 1079 CFI_ENDPROC
@@ -1287,40 +1223,29 @@ syscall_table_size=(.-sys_call_table)
1287 1223
1288ENTRY(page_fault) 1224ENTRY(page_fault)
1289 RING0_EC_FRAME 1225 RING0_EC_FRAME
1290 pushl $do_page_fault 1226 pushl_cfi $do_page_fault
1291 CFI_ADJUST_CFA_OFFSET 4
1292 ALIGN 1227 ALIGN
1293error_code: 1228error_code:
1294 /* the function address is in %gs's slot on the stack */ 1229 /* the function address is in %gs's slot on the stack */
1295 pushl %fs 1230 pushl_cfi %fs
1296 CFI_ADJUST_CFA_OFFSET 4
1297 /*CFI_REL_OFFSET fs, 0*/ 1231 /*CFI_REL_OFFSET fs, 0*/
1298 pushl %es 1232 pushl_cfi %es
1299 CFI_ADJUST_CFA_OFFSET 4
1300 /*CFI_REL_OFFSET es, 0*/ 1233 /*CFI_REL_OFFSET es, 0*/
1301 pushl %ds 1234 pushl_cfi %ds
1302 CFI_ADJUST_CFA_OFFSET 4
1303 /*CFI_REL_OFFSET ds, 0*/ 1235 /*CFI_REL_OFFSET ds, 0*/
1304 pushl %eax 1236 pushl_cfi %eax
1305 CFI_ADJUST_CFA_OFFSET 4
1306 CFI_REL_OFFSET eax, 0 1237 CFI_REL_OFFSET eax, 0
1307 pushl %ebp 1238 pushl_cfi %ebp
1308 CFI_ADJUST_CFA_OFFSET 4
1309 CFI_REL_OFFSET ebp, 0 1239 CFI_REL_OFFSET ebp, 0
1310 pushl %edi 1240 pushl_cfi %edi
1311 CFI_ADJUST_CFA_OFFSET 4
1312 CFI_REL_OFFSET edi, 0 1241 CFI_REL_OFFSET edi, 0
1313 pushl %esi 1242 pushl_cfi %esi
1314 CFI_ADJUST_CFA_OFFSET 4
1315 CFI_REL_OFFSET esi, 0 1243 CFI_REL_OFFSET esi, 0
1316 pushl %edx 1244 pushl_cfi %edx
1317 CFI_ADJUST_CFA_OFFSET 4
1318 CFI_REL_OFFSET edx, 0 1245 CFI_REL_OFFSET edx, 0
1319 pushl %ecx 1246 pushl_cfi %ecx
1320 CFI_ADJUST_CFA_OFFSET 4
1321 CFI_REL_OFFSET ecx, 0 1247 CFI_REL_OFFSET ecx, 0
1322 pushl %ebx 1248 pushl_cfi %ebx
1323 CFI_ADJUST_CFA_OFFSET 4
1324 CFI_REL_OFFSET ebx, 0 1249 CFI_REL_OFFSET ebx, 0
1325 cld 1250 cld
1326 movl $(__KERNEL_PERCPU), %ecx 1251 movl $(__KERNEL_PERCPU), %ecx
@@ -1362,12 +1287,9 @@ END(page_fault)
1362 movl TSS_sysenter_sp0 + \offset(%esp), %esp 1287 movl TSS_sysenter_sp0 + \offset(%esp), %esp
1363 CFI_DEF_CFA esp, 0 1288 CFI_DEF_CFA esp, 0
1364 CFI_UNDEFINED eip 1289 CFI_UNDEFINED eip
1365 pushfl 1290 pushfl_cfi
1366 CFI_ADJUST_CFA_OFFSET 4 1291 pushl_cfi $__KERNEL_CS
1367 pushl $__KERNEL_CS 1292 pushl_cfi $sysenter_past_esp
1368 CFI_ADJUST_CFA_OFFSET 4
1369 pushl $sysenter_past_esp
1370 CFI_ADJUST_CFA_OFFSET 4
1371 CFI_REL_OFFSET eip, 0 1293 CFI_REL_OFFSET eip, 0
1372.endm 1294.endm
1373 1295
@@ -1377,8 +1299,7 @@ ENTRY(debug)
1377 jne debug_stack_correct 1299 jne debug_stack_correct
1378 FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn 1300 FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
1379debug_stack_correct: 1301debug_stack_correct:
1380 pushl $-1 # mark this as an int 1302 pushl_cfi $-1 # mark this as an int
1381 CFI_ADJUST_CFA_OFFSET 4
1382 SAVE_ALL 1303 SAVE_ALL
1383 TRACE_IRQS_OFF 1304 TRACE_IRQS_OFF
1384 xorl %edx,%edx # error code 0 1305 xorl %edx,%edx # error code 0
@@ -1398,32 +1319,27 @@ END(debug)
1398 */ 1319 */
1399ENTRY(nmi) 1320ENTRY(nmi)
1400 RING0_INT_FRAME 1321 RING0_INT_FRAME
1401 pushl %eax 1322 pushl_cfi %eax
1402 CFI_ADJUST_CFA_OFFSET 4
1403 movl %ss, %eax 1323 movl %ss, %eax
1404 cmpw $__ESPFIX_SS, %ax 1324 cmpw $__ESPFIX_SS, %ax
1405 popl %eax 1325 popl_cfi %eax
1406 CFI_ADJUST_CFA_OFFSET -4
1407 je nmi_espfix_stack 1326 je nmi_espfix_stack
1408 cmpl $ia32_sysenter_target,(%esp) 1327 cmpl $ia32_sysenter_target,(%esp)
1409 je nmi_stack_fixup 1328 je nmi_stack_fixup
1410 pushl %eax 1329 pushl_cfi %eax
1411 CFI_ADJUST_CFA_OFFSET 4
1412 movl %esp,%eax 1330 movl %esp,%eax
1413 /* Do not access memory above the end of our stack page, 1331 /* Do not access memory above the end of our stack page,
1414 * it might not exist. 1332 * it might not exist.
1415 */ 1333 */
1416 andl $(THREAD_SIZE-1),%eax 1334 andl $(THREAD_SIZE-1),%eax
1417 cmpl $(THREAD_SIZE-20),%eax 1335 cmpl $(THREAD_SIZE-20),%eax
1418 popl %eax 1336 popl_cfi %eax
1419 CFI_ADJUST_CFA_OFFSET -4
1420 jae nmi_stack_correct 1337 jae nmi_stack_correct
1421 cmpl $ia32_sysenter_target,12(%esp) 1338 cmpl $ia32_sysenter_target,12(%esp)
1422 je nmi_debug_stack_check 1339 je nmi_debug_stack_check
1423nmi_stack_correct: 1340nmi_stack_correct:
1424 /* We have a RING0_INT_FRAME here */ 1341 /* We have a RING0_INT_FRAME here */
1425 pushl %eax 1342 pushl_cfi %eax
1426 CFI_ADJUST_CFA_OFFSET 4
1427 SAVE_ALL 1343 SAVE_ALL
1428 xorl %edx,%edx # zero error code 1344 xorl %edx,%edx # zero error code
1429 movl %esp,%eax # pt_regs pointer 1345 movl %esp,%eax # pt_regs pointer
@@ -1452,18 +1368,14 @@ nmi_espfix_stack:
1452 * 1368 *
1453 * create the pointer to lss back 1369 * create the pointer to lss back
1454 */ 1370 */
1455 pushl %ss 1371 pushl_cfi %ss
1456 CFI_ADJUST_CFA_OFFSET 4 1372 pushl_cfi %esp
1457 pushl %esp
1458 CFI_ADJUST_CFA_OFFSET 4
1459 addl $4, (%esp) 1373 addl $4, (%esp)
1460 /* copy the iret frame of 12 bytes */ 1374 /* copy the iret frame of 12 bytes */
1461 .rept 3 1375 .rept 3
1462 pushl 16(%esp) 1376 pushl_cfi 16(%esp)
1463 CFI_ADJUST_CFA_OFFSET 4
1464 .endr 1377 .endr
1465 pushl %eax 1378 pushl_cfi %eax
1466 CFI_ADJUST_CFA_OFFSET 4
1467 SAVE_ALL 1379 SAVE_ALL
1468 FIXUP_ESPFIX_STACK # %eax == %esp 1380 FIXUP_ESPFIX_STACK # %eax == %esp
1469 xorl %edx,%edx # zero error code 1381 xorl %edx,%edx # zero error code
@@ -1477,8 +1389,7 @@ END(nmi)
1477 1389
1478ENTRY(int3) 1390ENTRY(int3)
1479 RING0_INT_FRAME 1391 RING0_INT_FRAME
1480 pushl $-1 # mark this as an int 1392 pushl_cfi $-1 # mark this as an int
1481 CFI_ADJUST_CFA_OFFSET 4
1482 SAVE_ALL 1393 SAVE_ALL
1483 TRACE_IRQS_OFF 1394 TRACE_IRQS_OFF
1484 xorl %edx,%edx # zero error code 1395 xorl %edx,%edx # zero error code
@@ -1490,8 +1401,7 @@ END(int3)
1490 1401
1491ENTRY(general_protection) 1402ENTRY(general_protection)
1492 RING0_EC_FRAME 1403 RING0_EC_FRAME
1493 pushl $do_general_protection 1404 pushl_cfi $do_general_protection
1494 CFI_ADJUST_CFA_OFFSET 4
1495 jmp error_code 1405 jmp error_code
1496 CFI_ENDPROC 1406 CFI_ENDPROC
1497END(general_protection) 1407END(general_protection)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index c375c79065f8..a7ae7fd1010f 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -213,23 +213,17 @@ ENDPROC(native_usergs_sysret64)
213 .macro FAKE_STACK_FRAME child_rip 213 .macro FAKE_STACK_FRAME child_rip
214 /* push in order ss, rsp, eflags, cs, rip */ 214 /* push in order ss, rsp, eflags, cs, rip */
215 xorl %eax, %eax 215 xorl %eax, %eax
216 pushq $__KERNEL_DS /* ss */ 216 pushq_cfi $__KERNEL_DS /* ss */
217 CFI_ADJUST_CFA_OFFSET 8
218 /*CFI_REL_OFFSET ss,0*/ 217 /*CFI_REL_OFFSET ss,0*/
219 pushq %rax /* rsp */ 218 pushq_cfi %rax /* rsp */
220 CFI_ADJUST_CFA_OFFSET 8
221 CFI_REL_OFFSET rsp,0 219 CFI_REL_OFFSET rsp,0
222 pushq $X86_EFLAGS_IF /* eflags - interrupts on */ 220 pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */
223 CFI_ADJUST_CFA_OFFSET 8
224 /*CFI_REL_OFFSET rflags,0*/ 221 /*CFI_REL_OFFSET rflags,0*/
225 pushq $__KERNEL_CS /* cs */ 222 pushq_cfi $__KERNEL_CS /* cs */
226 CFI_ADJUST_CFA_OFFSET 8
227 /*CFI_REL_OFFSET cs,0*/ 223 /*CFI_REL_OFFSET cs,0*/
228 pushq \child_rip /* rip */ 224 pushq_cfi \child_rip /* rip */
229 CFI_ADJUST_CFA_OFFSET 8
230 CFI_REL_OFFSET rip,0 225 CFI_REL_OFFSET rip,0
231 pushq %rax /* orig rax */ 226 pushq_cfi %rax /* orig rax */
232 CFI_ADJUST_CFA_OFFSET 8
233 .endm 227 .endm
234 228
235 .macro UNFAKE_STACK_FRAME 229 .macro UNFAKE_STACK_FRAME
@@ -398,10 +392,8 @@ ENTRY(ret_from_fork)
398 392
399 LOCK ; btr $TIF_FORK,TI_flags(%r8) 393 LOCK ; btr $TIF_FORK,TI_flags(%r8)
400 394
401 push kernel_eflags(%rip) 395 pushq_cfi kernel_eflags(%rip)
402 CFI_ADJUST_CFA_OFFSET 8 396 popfq_cfi # reset kernel eflags
403 popf # reset kernel eflags
404 CFI_ADJUST_CFA_OFFSET -8
405 397
406 call schedule_tail # rdi: 'prev' task parameter 398 call schedule_tail # rdi: 'prev' task parameter
407 399
@@ -521,11 +513,9 @@ sysret_careful:
521 jnc sysret_signal 513 jnc sysret_signal
522 TRACE_IRQS_ON 514 TRACE_IRQS_ON
523 ENABLE_INTERRUPTS(CLBR_NONE) 515 ENABLE_INTERRUPTS(CLBR_NONE)
524 pushq %rdi 516 pushq_cfi %rdi
525 CFI_ADJUST_CFA_OFFSET 8
526 call schedule 517 call schedule
527 popq %rdi 518 popq_cfi %rdi
528 CFI_ADJUST_CFA_OFFSET -8
529 jmp sysret_check 519 jmp sysret_check
530 520
531 /* Handle a signal */ 521 /* Handle a signal */
@@ -634,11 +624,9 @@ int_careful:
634 jnc int_very_careful 624 jnc int_very_careful
635 TRACE_IRQS_ON 625 TRACE_IRQS_ON
636 ENABLE_INTERRUPTS(CLBR_NONE) 626 ENABLE_INTERRUPTS(CLBR_NONE)
637 pushq %rdi 627 pushq_cfi %rdi
638 CFI_ADJUST_CFA_OFFSET 8
639 call schedule 628 call schedule
640 popq %rdi 629 popq_cfi %rdi
641 CFI_ADJUST_CFA_OFFSET -8
642 DISABLE_INTERRUPTS(CLBR_NONE) 630 DISABLE_INTERRUPTS(CLBR_NONE)
643 TRACE_IRQS_OFF 631 TRACE_IRQS_OFF
644 jmp int_with_check 632 jmp int_with_check
@@ -652,12 +640,10 @@ int_check_syscall_exit_work:
652 /* Check for syscall exit trace */ 640 /* Check for syscall exit trace */
653 testl $_TIF_WORK_SYSCALL_EXIT,%edx 641 testl $_TIF_WORK_SYSCALL_EXIT,%edx
654 jz int_signal 642 jz int_signal
655 pushq %rdi 643 pushq_cfi %rdi
656 CFI_ADJUST_CFA_OFFSET 8
657 leaq 8(%rsp),%rdi # &ptregs -> arg1 644 leaq 8(%rsp),%rdi # &ptregs -> arg1
658 call syscall_trace_leave 645 call syscall_trace_leave
659 popq %rdi 646 popq_cfi %rdi
660 CFI_ADJUST_CFA_OFFSET -8
661 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi 647 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
662 jmp int_restore_rest 648 jmp int_restore_rest
663 649
@@ -714,9 +700,8 @@ END(ptregscall_common)
714 700
715ENTRY(stub_execve) 701ENTRY(stub_execve)
716 CFI_STARTPROC 702 CFI_STARTPROC
717 popq %r11 703 addq $8, %rsp
718 CFI_ADJUST_CFA_OFFSET -8 704 PARTIAL_FRAME 0
719 CFI_REGISTER rip, r11
720 SAVE_REST 705 SAVE_REST
721 FIXUP_TOP_OF_STACK %r11 706 FIXUP_TOP_OF_STACK %r11
722 movq %rsp, %rcx 707 movq %rsp, %rcx
@@ -735,7 +720,7 @@ END(stub_execve)
735ENTRY(stub_rt_sigreturn) 720ENTRY(stub_rt_sigreturn)
736 CFI_STARTPROC 721 CFI_STARTPROC
737 addq $8, %rsp 722 addq $8, %rsp
738 CFI_ADJUST_CFA_OFFSET -8 723 PARTIAL_FRAME 0
739 SAVE_REST 724 SAVE_REST
740 movq %rsp,%rdi 725 movq %rsp,%rdi
741 FIXUP_TOP_OF_STACK %r11 726 FIXUP_TOP_OF_STACK %r11
@@ -766,8 +751,7 @@ vector=FIRST_EXTERNAL_VECTOR
766 .if vector <> FIRST_EXTERNAL_VECTOR 751 .if vector <> FIRST_EXTERNAL_VECTOR
767 CFI_ADJUST_CFA_OFFSET -8 752 CFI_ADJUST_CFA_OFFSET -8
768 .endif 753 .endif
7691: pushq $(~vector+0x80) /* Note: always in signed byte range */ 7541: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */
770 CFI_ADJUST_CFA_OFFSET 8
771 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 755 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
772 jmp 2f 756 jmp 2f
773 .endif 757 .endif
@@ -796,8 +780,8 @@ END(interrupt)
796 780
797/* 0(%rsp): ~(interrupt number) */ 781/* 0(%rsp): ~(interrupt number) */
798 .macro interrupt func 782 .macro interrupt func
799 subq $10*8, %rsp 783 subq $ORIG_RAX-ARGOFFSET+8, %rsp
800 CFI_ADJUST_CFA_OFFSET 10*8 784 CFI_ADJUST_CFA_OFFSET ORIG_RAX-ARGOFFSET+8
801 call save_args 785 call save_args
802 PARTIAL_FRAME 0 786 PARTIAL_FRAME 0
803 call \func 787 call \func
@@ -822,6 +806,7 @@ ret_from_intr:
822 TRACE_IRQS_OFF 806 TRACE_IRQS_OFF
823 decl PER_CPU_VAR(irq_count) 807 decl PER_CPU_VAR(irq_count)
824 leaveq 808 leaveq
809 CFI_RESTORE rbp
825 CFI_DEF_CFA_REGISTER rsp 810 CFI_DEF_CFA_REGISTER rsp
826 CFI_ADJUST_CFA_OFFSET -8 811 CFI_ADJUST_CFA_OFFSET -8
827exit_intr: 812exit_intr:
@@ -903,11 +888,9 @@ retint_careful:
903 jnc retint_signal 888 jnc retint_signal
904 TRACE_IRQS_ON 889 TRACE_IRQS_ON
905 ENABLE_INTERRUPTS(CLBR_NONE) 890 ENABLE_INTERRUPTS(CLBR_NONE)
906 pushq %rdi 891 pushq_cfi %rdi
907 CFI_ADJUST_CFA_OFFSET 8
908 call schedule 892 call schedule
909 popq %rdi 893 popq_cfi %rdi
910 CFI_ADJUST_CFA_OFFSET -8
911 GET_THREAD_INFO(%rcx) 894 GET_THREAD_INFO(%rcx)
912 DISABLE_INTERRUPTS(CLBR_NONE) 895 DISABLE_INTERRUPTS(CLBR_NONE)
913 TRACE_IRQS_OFF 896 TRACE_IRQS_OFF
@@ -956,8 +939,7 @@ END(common_interrupt)
956.macro apicinterrupt num sym do_sym 939.macro apicinterrupt num sym do_sym
957ENTRY(\sym) 940ENTRY(\sym)
958 INTR_FRAME 941 INTR_FRAME
959 pushq $~(\num) 942 pushq_cfi $~(\num)
960 CFI_ADJUST_CFA_OFFSET 8
961 interrupt \do_sym 943 interrupt \do_sym
962 jmp ret_from_intr 944 jmp ret_from_intr
963 CFI_ENDPROC 945 CFI_ENDPROC
@@ -1036,8 +1018,8 @@ ENTRY(\sym)
1036 INTR_FRAME 1018 INTR_FRAME
1037 PARAVIRT_ADJUST_EXCEPTION_FRAME 1019 PARAVIRT_ADJUST_EXCEPTION_FRAME
1038 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ 1020 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1039 subq $15*8,%rsp 1021 subq $ORIG_RAX-R15, %rsp
1040 CFI_ADJUST_CFA_OFFSET 15*8 1022 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1041 call error_entry 1023 call error_entry
1042 DEFAULT_FRAME 0 1024 DEFAULT_FRAME 0
1043 movq %rsp,%rdi /* pt_regs pointer */ 1025 movq %rsp,%rdi /* pt_regs pointer */
@@ -1052,9 +1034,9 @@ END(\sym)
1052ENTRY(\sym) 1034ENTRY(\sym)
1053 INTR_FRAME 1035 INTR_FRAME
1054 PARAVIRT_ADJUST_EXCEPTION_FRAME 1036 PARAVIRT_ADJUST_EXCEPTION_FRAME
1055 pushq $-1 /* ORIG_RAX: no syscall to restart */ 1037 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1056 CFI_ADJUST_CFA_OFFSET 8 1038 subq $ORIG_RAX-R15, %rsp
1057 subq $15*8, %rsp 1039 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1058 call save_paranoid 1040 call save_paranoid
1059 TRACE_IRQS_OFF 1041 TRACE_IRQS_OFF
1060 movq %rsp,%rdi /* pt_regs pointer */ 1042 movq %rsp,%rdi /* pt_regs pointer */
@@ -1070,9 +1052,9 @@ END(\sym)
1070ENTRY(\sym) 1052ENTRY(\sym)
1071 INTR_FRAME 1053 INTR_FRAME
1072 PARAVIRT_ADJUST_EXCEPTION_FRAME 1054 PARAVIRT_ADJUST_EXCEPTION_FRAME
1073 pushq $-1 /* ORIG_RAX: no syscall to restart */ 1055 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1074 CFI_ADJUST_CFA_OFFSET 8 1056 subq $ORIG_RAX-R15, %rsp
1075 subq $15*8, %rsp 1057 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1076 call save_paranoid 1058 call save_paranoid
1077 TRACE_IRQS_OFF 1059 TRACE_IRQS_OFF
1078 movq %rsp,%rdi /* pt_regs pointer */ 1060 movq %rsp,%rdi /* pt_regs pointer */
@@ -1089,8 +1071,8 @@ END(\sym)
1089ENTRY(\sym) 1071ENTRY(\sym)
1090 XCPT_FRAME 1072 XCPT_FRAME
1091 PARAVIRT_ADJUST_EXCEPTION_FRAME 1073 PARAVIRT_ADJUST_EXCEPTION_FRAME
1092 subq $15*8,%rsp 1074 subq $ORIG_RAX-R15, %rsp
1093 CFI_ADJUST_CFA_OFFSET 15*8 1075 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1094 call error_entry 1076 call error_entry
1095 DEFAULT_FRAME 0 1077 DEFAULT_FRAME 0
1096 movq %rsp,%rdi /* pt_regs pointer */ 1078 movq %rsp,%rdi /* pt_regs pointer */
@@ -1107,8 +1089,8 @@ END(\sym)
1107ENTRY(\sym) 1089ENTRY(\sym)
1108 XCPT_FRAME 1090 XCPT_FRAME
1109 PARAVIRT_ADJUST_EXCEPTION_FRAME 1091 PARAVIRT_ADJUST_EXCEPTION_FRAME
1110 subq $15*8,%rsp 1092 subq $ORIG_RAX-R15, %rsp
1111 CFI_ADJUST_CFA_OFFSET 15*8 1093 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1112 call save_paranoid 1094 call save_paranoid
1113 DEFAULT_FRAME 0 1095 DEFAULT_FRAME 0
1114 TRACE_IRQS_OFF 1096 TRACE_IRQS_OFF
@@ -1139,16 +1121,14 @@ zeroentry simd_coprocessor_error do_simd_coprocessor_error
1139 /* edi: new selector */ 1121 /* edi: new selector */
1140ENTRY(native_load_gs_index) 1122ENTRY(native_load_gs_index)
1141 CFI_STARTPROC 1123 CFI_STARTPROC
1142 pushf 1124 pushfq_cfi
1143 CFI_ADJUST_CFA_OFFSET 8
1144 DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) 1125 DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
1145 SWAPGS 1126 SWAPGS
1146gs_change: 1127gs_change:
1147 movl %edi,%gs 1128 movl %edi,%gs
11482: mfence /* workaround */ 11292: mfence /* workaround */
1149 SWAPGS 1130 SWAPGS
1150 popf 1131 popfq_cfi
1151 CFI_ADJUST_CFA_OFFSET -8
1152 ret 1132 ret
1153 CFI_ENDPROC 1133 CFI_ENDPROC
1154END(native_load_gs_index) 1134END(native_load_gs_index)
@@ -1215,8 +1195,7 @@ END(kernel_execve)
1215/* Call softirq on interrupt stack. Interrupts are off. */ 1195/* Call softirq on interrupt stack. Interrupts are off. */
1216ENTRY(call_softirq) 1196ENTRY(call_softirq)
1217 CFI_STARTPROC 1197 CFI_STARTPROC
1218 push %rbp 1198 pushq_cfi %rbp
1219 CFI_ADJUST_CFA_OFFSET 8
1220 CFI_REL_OFFSET rbp,0 1199 CFI_REL_OFFSET rbp,0
1221 mov %rsp,%rbp 1200 mov %rsp,%rbp
1222 CFI_DEF_CFA_REGISTER rbp 1201 CFI_DEF_CFA_REGISTER rbp
@@ -1225,6 +1204,7 @@ ENTRY(call_softirq)
1225 push %rbp # backlink for old unwinder 1204 push %rbp # backlink for old unwinder
1226 call __do_softirq 1205 call __do_softirq
1227 leaveq 1206 leaveq
1207 CFI_RESTORE rbp
1228 CFI_DEF_CFA_REGISTER rsp 1208 CFI_DEF_CFA_REGISTER rsp
1229 CFI_ADJUST_CFA_OFFSET -8 1209 CFI_ADJUST_CFA_OFFSET -8
1230 decl PER_CPU_VAR(irq_count) 1210 decl PER_CPU_VAR(irq_count)
@@ -1368,7 +1348,7 @@ paranoidzeroentry machine_check *machine_check_vector(%rip)
1368 1348
1369 /* ebx: no swapgs flag */ 1349 /* ebx: no swapgs flag */
1370ENTRY(paranoid_exit) 1350ENTRY(paranoid_exit)
1371 INTR_FRAME 1351 DEFAULT_FRAME
1372 DISABLE_INTERRUPTS(CLBR_NONE) 1352 DISABLE_INTERRUPTS(CLBR_NONE)
1373 TRACE_IRQS_OFF 1353 TRACE_IRQS_OFF
1374 testl %ebx,%ebx /* swapgs needed? */ 1354 testl %ebx,%ebx /* swapgs needed? */
@@ -1445,7 +1425,6 @@ error_swapgs:
1445error_sti: 1425error_sti:
1446 TRACE_IRQS_OFF 1426 TRACE_IRQS_OFF
1447 ret 1427 ret
1448 CFI_ENDPROC
1449 1428
1450/* 1429/*
1451 * There are two places in the kernel that can potentially fault with 1430 * There are two places in the kernel that can potentially fault with
@@ -1470,6 +1449,7 @@ bstep_iret:
1470 /* Fix truncated RIP */ 1449 /* Fix truncated RIP */
1471 movq %rcx,RIP+8(%rsp) 1450 movq %rcx,RIP+8(%rsp)
1472 jmp error_swapgs 1451 jmp error_swapgs
1452 CFI_ENDPROC
1473END(error_entry) 1453END(error_entry)
1474 1454
1475 1455
@@ -1498,8 +1478,8 @@ ENTRY(nmi)
1498 INTR_FRAME 1478 INTR_FRAME
1499 PARAVIRT_ADJUST_EXCEPTION_FRAME 1479 PARAVIRT_ADJUST_EXCEPTION_FRAME
1500 pushq_cfi $-1 1480 pushq_cfi $-1
1501 subq $15*8, %rsp 1481 subq $ORIG_RAX-R15, %rsp
1502 CFI_ADJUST_CFA_OFFSET 15*8 1482 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1503 call save_paranoid 1483 call save_paranoid
1504 DEFAULT_FRAME 0 1484 DEFAULT_FRAME 0
1505 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ 1485 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
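Several hunks above replace the literal 15*8 stack adjustment with ORIG_RAX-R15. A standalone sketch of why the two are the same number; the pt_regs field order below is an assumption written out for illustration (the real offsets come from asm-offsets), only the equality itself is implied by the hunks:

#include <assert.h>
#include <stddef.h>

/* Assumed x86-64 pt_regs layout: fifteen 8-byte GPR slots precede orig_ax,
 * so ORIG_RAX - R15 == 15*8, matching the old literal adjustment. */
struct pt_regs_sketch {
	unsigned long r15, r14, r13, r12, bp, bx, r11, r10;
	unsigned long r9, r8, ax, cx, dx, si, di;
	unsigned long orig_ax, ip, cs, flags, sp, ss;
};

int main(void)
{
	assert(offsetof(struct pt_regs_sketch, orig_ax) -
	       offsetof(struct pt_regs_sketch, r15) == 15 * 8);
	return 0;
}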
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 7494999141b3..efaf906daf93 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -440,9 +440,9 @@ static int hpet_legacy_next_event(unsigned long delta,
440static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev); 440static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
441static struct hpet_dev *hpet_devs; 441static struct hpet_dev *hpet_devs;
442 442
443void hpet_msi_unmask(unsigned int irq) 443void hpet_msi_unmask(struct irq_data *data)
444{ 444{
445 struct hpet_dev *hdev = get_irq_data(irq); 445 struct hpet_dev *hdev = data->handler_data;
446 unsigned int cfg; 446 unsigned int cfg;
447 447
448 /* unmask it */ 448 /* unmask it */
@@ -451,10 +451,10 @@ void hpet_msi_unmask(unsigned int irq)
451 hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); 451 hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
452} 452}
453 453
454void hpet_msi_mask(unsigned int irq) 454void hpet_msi_mask(struct irq_data *data)
455{ 455{
456 struct hpet_dev *hdev = data->handler_data;
456 unsigned int cfg; 457 unsigned int cfg;
457 struct hpet_dev *hdev = get_irq_data(irq);
458 458
459 /* mask it */ 459 /* mask it */
460 cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); 460 cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
@@ -462,18 +462,14 @@ void hpet_msi_mask(unsigned int irq)
462 hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); 462 hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
463} 463}
464 464
465void hpet_msi_write(unsigned int irq, struct msi_msg *msg) 465void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg)
466{ 466{
467 struct hpet_dev *hdev = get_irq_data(irq);
468
469 hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num)); 467 hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num));
470 hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4); 468 hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4);
471} 469}
472 470
473void hpet_msi_read(unsigned int irq, struct msi_msg *msg) 471void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg)
474{ 472{
475 struct hpet_dev *hdev = get_irq_data(irq);
476
477 msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num)); 473 msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num));
478 msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4); 474 msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4);
479 msg->address_hi = 0; 475 msg->address_hi = 0;
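With hpet_msi_mask()/hpet_msi_unmask() now taking struct irq_data and hpet_msi_read()/hpet_msi_write() taking the hpet_dev directly, a caller on the MSI chip side would look roughly like the sketch below. The function name and the message rewrite are assumptions; only the data->handler_data lookup and the new read/write signatures come from the hunk above.

static int hpet_msi_set_affinity_sketch(struct irq_data *data,
					const struct cpumask *mask, bool force)
{
	struct hpet_dev *hdev = data->handler_data;
	struct msi_msg msg;

	hpet_msi_read(hdev, &msg);
	/* ...retarget msg.data/msg.address_lo for 'mask' here... */
	hpet_msi_write(hdev, &msg);
	return 0;
}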
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index a46cb3522c0c..58bb239a2fd7 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -68,19 +68,22 @@ static void __cpuinit init_thread_xstate(void)
68 */ 68 */
69 69
70 if (!HAVE_HWFP) { 70 if (!HAVE_HWFP) {
71 /*
72 * Disable xsave as we do not support it if i387
73 * emulation is enabled.
74 */
75 setup_clear_cpu_cap(X86_FEATURE_XSAVE);
76 setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
71 xstate_size = sizeof(struct i387_soft_struct); 77 xstate_size = sizeof(struct i387_soft_struct);
72 return; 78 return;
73 } 79 }
74 80
75 if (cpu_has_fxsr) 81 if (cpu_has_fxsr)
76 xstate_size = sizeof(struct i387_fxsave_struct); 82 xstate_size = sizeof(struct i387_fxsave_struct);
77#ifdef CONFIG_X86_32
78 else 83 else
79 xstate_size = sizeof(struct i387_fsave_struct); 84 xstate_size = sizeof(struct i387_fsave_struct);
80#endif
81} 85}
82 86
83#ifdef CONFIG_X86_64
84/* 87/*
85 * Called at bootup to set up the initial FPU state that is later cloned 88 * Called at bootup to set up the initial FPU state that is later cloned
86 * into all processes. 89 * into all processes.
@@ -88,12 +91,21 @@ static void __cpuinit init_thread_xstate(void)
88 91
89void __cpuinit fpu_init(void) 92void __cpuinit fpu_init(void)
90{ 93{
91 unsigned long oldcr0 = read_cr0(); 94 unsigned long cr0;
92 95 unsigned long cr4_mask = 0;
93 set_in_cr4(X86_CR4_OSFXSR);
94 set_in_cr4(X86_CR4_OSXMMEXCPT);
95 96
96 write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ 97 if (cpu_has_fxsr)
98 cr4_mask |= X86_CR4_OSFXSR;
99 if (cpu_has_xmm)
100 cr4_mask |= X86_CR4_OSXMMEXCPT;
101 if (cr4_mask)
102 set_in_cr4(cr4_mask);
103
104 cr0 = read_cr0();
105 cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
106 if (!HAVE_HWFP)
107 cr0 |= X86_CR0_EM;
108 write_cr0(cr0);
97 109
98 if (!smp_processor_id()) 110 if (!smp_processor_id())
99 init_thread_xstate(); 111 init_thread_xstate();
@@ -104,24 +116,12 @@ void __cpuinit fpu_init(void)
104 clear_used_math(); 116 clear_used_math();
105} 117}
106 118
107#else /* CONFIG_X86_64 */
108
109void __cpuinit fpu_init(void)
110{
111 if (!smp_processor_id())
112 init_thread_xstate();
113}
114
115#endif /* CONFIG_X86_32 */
116
117void fpu_finit(struct fpu *fpu) 119void fpu_finit(struct fpu *fpu)
118{ 120{
119#ifdef CONFIG_X86_32
120 if (!HAVE_HWFP) { 121 if (!HAVE_HWFP) {
121 finit_soft_fpu(&fpu->state->soft); 122 finit_soft_fpu(&fpu->state->soft);
122 return; 123 return;
123 } 124 }
124#endif
125 125
126 if (cpu_has_fxsr) { 126 if (cpu_has_fxsr) {
127 struct i387_fxsave_struct *fx = &fpu->state->fxsave; 127 struct i387_fxsave_struct *fx = &fpu->state->fxsave;
@@ -386,19 +386,17 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
386#ifdef CONFIG_X86_64 386#ifdef CONFIG_X86_64
387 env->fip = fxsave->rip; 387 env->fip = fxsave->rip;
388 env->foo = fxsave->rdp; 388 env->foo = fxsave->rdp;
389 /*
390 * should be actually ds/cs at fpu exception time, but
391 * that information is not available in 64bit mode.
392 */
393 env->fcs = task_pt_regs(tsk)->cs;
389 if (tsk == current) { 394 if (tsk == current) {
390 /* 395 savesegment(ds, env->fos);
391 * should be actually ds/cs at fpu exception time, but
392 * that information is not available in 64bit mode.
393 */
394 asm("mov %%ds, %[fos]" : [fos] "=r" (env->fos));
395 asm("mov %%cs, %[fcs]" : [fcs] "=r" (env->fcs));
396 } else { 396 } else {
397 struct pt_regs *regs = task_pt_regs(tsk); 397 env->fos = tsk->thread.ds;
398
399 env->fos = 0xffff0000 | tsk->thread.ds;
400 env->fcs = regs->cs;
401 } 398 }
399 env->fos |= 0xffff0000;
402#else 400#else
403 env->fip = fxsave->fip; 401 env->fip = fxsave->fip;
404 env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16); 402 env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16);
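convert_from_fxsr() now uses savesegment() instead of open-coded inline asm. Judging by the asm it replaces, the macro is assumed to be roughly equivalent to:

/* Assumed shape of savesegment(); the open-coded version it replaces
 * read %ds into a register the same way. */
#define savesegment_sketch(seg, value) \
	asm("mov %%" #seg ",%0" : "=r" (value) : : "memory")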
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index cafa7c80ac95..20757cb2efa3 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -29,24 +29,10 @@
29 * plus some generic x86 specific things if generic specifics makes 29 * plus some generic x86 specific things if generic specifics makes
30 * any sense at all. 30 * any sense at all.
31 */ 31 */
32static void init_8259A(int auto_eoi);
32 33
33static int i8259A_auto_eoi; 34static int i8259A_auto_eoi;
34DEFINE_RAW_SPINLOCK(i8259A_lock); 35DEFINE_RAW_SPINLOCK(i8259A_lock);
35static void mask_and_ack_8259A(unsigned int);
36static void mask_8259A(void);
37static void unmask_8259A(void);
38static void disable_8259A_irq(unsigned int irq);
39static void enable_8259A_irq(unsigned int irq);
40static void init_8259A(int auto_eoi);
41static int i8259A_irq_pending(unsigned int irq);
42
43struct irq_chip i8259A_chip = {
44 .name = "XT-PIC",
45 .mask = disable_8259A_irq,
46 .disable = disable_8259A_irq,
47 .unmask = enable_8259A_irq,
48 .mask_ack = mask_and_ack_8259A,
49};
50 36
51/* 37/*
52 * 8259A PIC functions to handle ISA devices: 38 * 8259A PIC functions to handle ISA devices:
@@ -68,7 +54,7 @@ unsigned int cached_irq_mask = 0xffff;
68 */ 54 */
69unsigned long io_apic_irqs; 55unsigned long io_apic_irqs;
70 56
71static void disable_8259A_irq(unsigned int irq) 57static void mask_8259A_irq(unsigned int irq)
72{ 58{
73 unsigned int mask = 1 << irq; 59 unsigned int mask = 1 << irq;
74 unsigned long flags; 60 unsigned long flags;
@@ -82,7 +68,12 @@ static void disable_8259A_irq(unsigned int irq)
82 raw_spin_unlock_irqrestore(&i8259A_lock, flags); 68 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
83} 69}
84 70
85static void enable_8259A_irq(unsigned int irq) 71static void disable_8259A_irq(struct irq_data *data)
72{
73 mask_8259A_irq(data->irq);
74}
75
76static void unmask_8259A_irq(unsigned int irq)
86{ 77{
87 unsigned int mask = ~(1 << irq); 78 unsigned int mask = ~(1 << irq);
88 unsigned long flags; 79 unsigned long flags;
@@ -96,6 +87,11 @@ static void enable_8259A_irq(unsigned int irq)
96 raw_spin_unlock_irqrestore(&i8259A_lock, flags); 87 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
97} 88}
98 89
90static void enable_8259A_irq(struct irq_data *data)
91{
92 unmask_8259A_irq(data->irq);
93}
94
99static int i8259A_irq_pending(unsigned int irq) 95static int i8259A_irq_pending(unsigned int irq)
100{ 96{
101 unsigned int mask = 1<<irq; 97 unsigned int mask = 1<<irq;
@@ -117,7 +113,7 @@ static void make_8259A_irq(unsigned int irq)
117 disable_irq_nosync(irq); 113 disable_irq_nosync(irq);
118 io_apic_irqs &= ~(1<<irq); 114 io_apic_irqs &= ~(1<<irq);
119 set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq, 115 set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
120 "XT"); 116 i8259A_chip.name);
121 enable_irq(irq); 117 enable_irq(irq);
122} 118}
123 119
@@ -150,8 +146,9 @@ static inline int i8259A_irq_real(unsigned int irq)
150 * first, _then_ send the EOI, and the order of EOI 146 * first, _then_ send the EOI, and the order of EOI
151 * to the two 8259s is important! 147 * to the two 8259s is important!
152 */ 148 */
153static void mask_and_ack_8259A(unsigned int irq) 149static void mask_and_ack_8259A(struct irq_data *data)
154{ 150{
151 unsigned int irq = data->irq;
155 unsigned int irqmask = 1 << irq; 152 unsigned int irqmask = 1 << irq;
156 unsigned long flags; 153 unsigned long flags;
157 154
@@ -223,6 +220,14 @@ spurious_8259A_irq:
223 } 220 }
224} 221}
225 222
223struct irq_chip i8259A_chip = {
224 .name = "XT-PIC",
225 .irq_mask = disable_8259A_irq,
226 .irq_disable = disable_8259A_irq,
227 .irq_unmask = enable_8259A_irq,
228 .irq_mask_ack = mask_and_ack_8259A,
229};
230
226static char irq_trigger[2]; 231static char irq_trigger[2];
227/** 232/**
228 * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ 233 * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
@@ -342,9 +347,9 @@ static void init_8259A(int auto_eoi)
342 * In AEOI mode we just have to mask the interrupt 347 * In AEOI mode we just have to mask the interrupt
343 * when acking. 348 * when acking.
344 */ 349 */
345 i8259A_chip.mask_ack = disable_8259A_irq; 350 i8259A_chip.irq_mask_ack = disable_8259A_irq;
346 else 351 else
347 i8259A_chip.mask_ack = mask_and_ack_8259A; 352 i8259A_chip.irq_mask_ack = mask_and_ack_8259A;
348 353
349 udelay(100); /* wait for 8259A to initialize */ 354 udelay(100); /* wait for 8259A to initialize */
350 355
@@ -363,14 +368,6 @@ static void init_8259A(int auto_eoi)
363static void legacy_pic_noop(void) { }; 368static void legacy_pic_noop(void) { };
364static void legacy_pic_uint_noop(unsigned int unused) { }; 369static void legacy_pic_uint_noop(unsigned int unused) { };
365static void legacy_pic_int_noop(int unused) { }; 370static void legacy_pic_int_noop(int unused) { };
366
367static struct irq_chip dummy_pic_chip = {
368 .name = "dummy pic",
369 .mask = legacy_pic_uint_noop,
370 .unmask = legacy_pic_uint_noop,
371 .disable = legacy_pic_uint_noop,
372 .mask_ack = legacy_pic_uint_noop,
373};
374static int legacy_pic_irq_pending_noop(unsigned int irq) 371static int legacy_pic_irq_pending_noop(unsigned int irq)
375{ 372{
376 return 0; 373 return 0;
@@ -378,7 +375,9 @@ static int legacy_pic_irq_pending_noop(unsigned int irq)
378 375
379struct legacy_pic null_legacy_pic = { 376struct legacy_pic null_legacy_pic = {
380 .nr_legacy_irqs = 0, 377 .nr_legacy_irqs = 0,
381 .chip = &dummy_pic_chip, 378 .chip = &dummy_irq_chip,
379 .mask = legacy_pic_uint_noop,
380 .unmask = legacy_pic_uint_noop,
382 .mask_all = legacy_pic_noop, 381 .mask_all = legacy_pic_noop,
383 .restore_mask = legacy_pic_noop, 382 .restore_mask = legacy_pic_noop,
384 .init = legacy_pic_int_noop, 383 .init = legacy_pic_int_noop,
@@ -389,7 +388,9 @@ struct legacy_pic null_legacy_pic = {
389struct legacy_pic default_legacy_pic = { 388struct legacy_pic default_legacy_pic = {
390 .nr_legacy_irqs = NR_IRQS_LEGACY, 389 .nr_legacy_irqs = NR_IRQS_LEGACY,
391 .chip = &i8259A_chip, 390 .chip = &i8259A_chip,
392 .mask_all = mask_8259A, 391 .mask = mask_8259A_irq,
392 .unmask = unmask_8259A_irq,
393 .mask_all = mask_8259A,
393 .restore_mask = unmask_8259A, 394 .restore_mask = unmask_8259A,
394 .init = init_8259A, 395 .init = init_8259A,
395 .irq_pending = i8259A_irq_pending, 396 .irq_pending = i8259A_irq_pending,
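The i8259A conversion above is the pattern used throughout this series: irq_chip callbacks take a struct irq_data pointer and are wired up through the irq_* member names. A minimal illustrative chip, with names that are not from this patch:

static void example_irq_mask(struct irq_data *data)
{
	/* the raw irq number is still reachable when the hardware needs it */
	pr_debug("mask irq %u\n", data->irq);
}

static void example_irq_unmask(struct irq_data *data)
{
	pr_debug("unmask irq %u\n", data->irq);
}

static struct irq_chip example_chip = {
	.name		= "EXAMPLE",
	.irq_mask	= example_irq_mask,
	.irq_unmask	= example_irq_unmask,
};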
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 44edb03fc9ec..83ec0175f986 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -159,7 +159,7 @@ int show_interrupts(struct seq_file *p, void *v)
159 seq_printf(p, "%*d: ", prec, i); 159 seq_printf(p, "%*d: ", prec, i);
160 for_each_online_cpu(j) 160 for_each_online_cpu(j)
161 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); 161 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
162 seq_printf(p, " %8s", desc->chip->name); 162 seq_printf(p, " %8s", desc->irq_data.chip->name);
163 seq_printf(p, "-%-8s", desc->name); 163 seq_printf(p, "-%-8s", desc->name);
164 164
165 if (action) { 165 if (action) {
@@ -282,6 +282,7 @@ void fixup_irqs(void)
282 unsigned int irq, vector; 282 unsigned int irq, vector;
283 static int warned; 283 static int warned;
284 struct irq_desc *desc; 284 struct irq_desc *desc;
285 struct irq_data *data;
285 286
286 for_each_irq_desc(irq, desc) { 287 for_each_irq_desc(irq, desc) {
287 int break_affinity = 0; 288 int break_affinity = 0;
@@ -296,7 +297,8 @@ void fixup_irqs(void)
296 /* interrupt's are disabled at this point */ 297 /* interrupt's are disabled at this point */
297 raw_spin_lock(&desc->lock); 298 raw_spin_lock(&desc->lock);
298 299
299 affinity = desc->affinity; 300 data = &desc->irq_data;
301 affinity = data->affinity;
300 if (!irq_has_action(irq) || 302 if (!irq_has_action(irq) ||
301 cpumask_equal(affinity, cpu_online_mask)) { 303 cpumask_equal(affinity, cpu_online_mask)) {
302 raw_spin_unlock(&desc->lock); 304 raw_spin_unlock(&desc->lock);
@@ -315,16 +317,16 @@ void fixup_irqs(void)
315 affinity = cpu_all_mask; 317 affinity = cpu_all_mask;
316 } 318 }
317 319
318 if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask) 320 if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_mask)
319 desc->chip->mask(irq); 321 data->chip->irq_mask(data);
320 322
321 if (desc->chip->set_affinity) 323 if (data->chip->irq_set_affinity)
322 desc->chip->set_affinity(irq, affinity); 324 data->chip->irq_set_affinity(data, affinity, true);
323 else if (!(warned++)) 325 else if (!(warned++))
324 set_affinity = 0; 326 set_affinity = 0;
325 327
326 if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask) 328 if (!(desc->status & IRQ_MOVE_PCNTXT) && data->chip->irq_unmask)
327 desc->chip->unmask(irq); 329 data->chip->irq_unmask(data);
328 330
329 raw_spin_unlock(&desc->lock); 331 raw_spin_unlock(&desc->lock);
330 332
@@ -355,10 +357,10 @@ void fixup_irqs(void)
355 if (irr & (1 << (vector % 32))) { 357 if (irr & (1 << (vector % 32))) {
356 irq = __get_cpu_var(vector_irq)[vector]; 358 irq = __get_cpu_var(vector_irq)[vector];
357 359
358 desc = irq_to_desc(irq); 360 data = irq_get_irq_data(irq);
359 raw_spin_lock(&desc->lock); 361 raw_spin_lock(&desc->lock);
360 if (desc->chip->retrigger) 362 if (data->chip->irq_retrigger)
361 desc->chip->retrigger(irq); 363 data->chip->irq_retrigger(data);
362 raw_spin_unlock(&desc->lock); 364 raw_spin_unlock(&desc->lock);
363 } 365 }
364 } 366 }
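fixup_irqs() now reaches the chip through desc->irq_data (or irq_get_irq_data()) and calls the irq_data-based methods. The callback shapes implied by those call sites are roughly the following sketch, not the full struct irq_chip definition:

struct irq_chip_sketch {
	void (*irq_mask)(struct irq_data *data);
	void (*irq_unmask)(struct irq_data *data);
	int  (*irq_set_affinity)(struct irq_data *data,
				 const struct cpumask *dest, bool force);
	int  (*irq_retrigger)(struct irq_data *data);
};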
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 713969b9266b..c752e973958d 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -100,6 +100,8 @@ int vector_used_by_percpu_irq(unsigned int vector)
100 100
101void __init init_ISA_irqs(void) 101void __init init_ISA_irqs(void)
102{ 102{
103 struct irq_chip *chip = legacy_pic->chip;
104 const char *name = chip->name;
103 int i; 105 int i;
104 106
105#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) 107#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
@@ -107,19 +109,8 @@ void __init init_ISA_irqs(void)
107#endif 109#endif
108 legacy_pic->init(0); 110 legacy_pic->init(0);
109 111
110 /* 112 for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
111 * 16 old-style INTA-cycle interrupts: 113 set_irq_chip_and_handler_name(i, chip, handle_level_irq, name);
112 */
113 for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) {
114 struct irq_desc *desc = irq_to_desc(i);
115
116 desc->status = IRQ_DISABLED;
117 desc->action = NULL;
118 desc->depth = 1;
119
120 set_irq_chip_and_handler_name(i, &i8259A_chip,
121 handle_level_irq, "XT");
122 }
123} 114}
124 115
125void __init init_IRQ(void) 116void __init init_IRQ(void)
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 035c8c529181..b3ea9db39db6 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -36,7 +36,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
36 if (!page) 36 if (!page)
37 goto out; 37 goto out;
38 pud = (pud_t *)page_address(page); 38 pud = (pud_t *)page_address(page);
39 memset(pud, 0, PAGE_SIZE); 39 clear_page(pud);
40 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); 40 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
41 } 41 }
42 pud = pud_offset(pgd, addr); 42 pud = pud_offset(pgd, addr);
@@ -45,7 +45,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
45 if (!page) 45 if (!page)
46 goto out; 46 goto out;
47 pmd = (pmd_t *)page_address(page); 47 pmd = (pmd_t *)page_address(page);
48 memset(pmd, 0, PAGE_SIZE); 48 clear_page(pmd);
49 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); 49 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
50 } 50 }
51 pmd = pmd_offset(pud, addr); 51 pmd = pmd_offset(pud, addr);
diff --git a/arch/x86/kernel/olpc-xo1.c b/arch/x86/kernel/olpc-xo1.c
new file mode 100644
index 000000000000..f5442c03abc3
--- /dev/null
+++ b/arch/x86/kernel/olpc-xo1.c
@@ -0,0 +1,140 @@
1/*
2 * Support for features of the OLPC XO-1 laptop
3 *
4 * Copyright (C) 2010 One Laptop per Child
5 * Copyright (C) 2006 Red Hat, Inc.
6 * Copyright (C) 2006 Advanced Micro Devices, Inc.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 */
13
14#include <linux/module.h>
15#include <linux/pci.h>
16#include <linux/pci_ids.h>
17#include <linux/platform_device.h>
18#include <linux/pm.h>
19
20#include <asm/io.h>
21#include <asm/olpc.h>
22
23#define DRV_NAME "olpc-xo1"
24
25#define PMS_BAR 4
26#define ACPI_BAR 5
27
28/* PMC registers (PMS block) */
29#define PM_SCLK 0x10
30#define PM_IN_SLPCTL 0x20
31#define PM_WKXD 0x34
32#define PM_WKD 0x30
33#define PM_SSC 0x54
34
35/* PM registers (ACPI block) */
36#define PM1_CNT 0x08
37#define PM_GPE0_STS 0x18
38
39static unsigned long acpi_base;
40static unsigned long pms_base;
41
42static void xo1_power_off(void)
43{
44 printk(KERN_INFO "OLPC XO-1 power off sequence...\n");
45
46 /* Enable all of these controls with 0 delay */
47 outl(0x40000000, pms_base + PM_SCLK);
48 outl(0x40000000, pms_base + PM_IN_SLPCTL);
49 outl(0x40000000, pms_base + PM_WKXD);
50 outl(0x40000000, pms_base + PM_WKD);
51
52 /* Clear status bits (possibly unnecessary) */
53 outl(0x0002ffff, pms_base + PM_SSC);
54 outl(0xffffffff, acpi_base + PM_GPE0_STS);
55
56 /* Write SLP_EN bit to start the machinery */
57 outl(0x00002000, acpi_base + PM1_CNT);
58}
59
60/* Read the base addresses from the PCI BAR info */
61static int __devinit setup_bases(struct pci_dev *pdev)
62{
63 int r;
64
65 r = pci_enable_device_io(pdev);
66 if (r) {
67 dev_err(&pdev->dev, "can't enable device IO\n");
68 return r;
69 }
70
71 r = pci_request_region(pdev, ACPI_BAR, DRV_NAME);
72 if (r) {
73 dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", ACPI_BAR);
74 return r;
75 }
76
77 r = pci_request_region(pdev, PMS_BAR, DRV_NAME);
78 if (r) {
79 dev_err(&pdev->dev, "can't alloc PCI BAR #%d\n", PMS_BAR);
80 pci_release_region(pdev, ACPI_BAR);
81 return r;
82 }
83
84 acpi_base = pci_resource_start(pdev, ACPI_BAR);
85 pms_base = pci_resource_start(pdev, PMS_BAR);
86
87 return 0;
88}
89
90static int __devinit olpc_xo1_probe(struct platform_device *pdev)
91{
92 struct pci_dev *pcidev;
93 int r;
94
95 pcidev = pci_get_device(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA,
96 NULL);
 97 if (!pcidev)
98 return -ENODEV;
99
100 r = setup_bases(pcidev);
101 if (r)
102 return r;
103
104 pm_power_off = xo1_power_off;
105
106 printk(KERN_INFO "OLPC XO-1 support registered\n");
107 return 0;
108}
109
110static int __devexit olpc_xo1_remove(struct platform_device *pdev)
111{
112 pm_power_off = NULL;
113 return 0;
114}
115
116static struct platform_driver olpc_xo1_driver = {
117 .driver = {
118 .name = DRV_NAME,
119 .owner = THIS_MODULE,
120 },
121 .probe = olpc_xo1_probe,
122 .remove = __devexit_p(olpc_xo1_remove),
123};
124
125static int __init olpc_xo1_init(void)
126{
127 return platform_driver_register(&olpc_xo1_driver);
128}
129
130static void __exit olpc_xo1_exit(void)
131{
132 platform_driver_unregister(&olpc_xo1_driver);
133}
134
135MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>");
136MODULE_LICENSE("GPL");
137MODULE_ALIAS("platform:olpc-xo1");
138
139module_init(olpc_xo1_init);
140module_exit(olpc_xo1_exit);
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 0e0cdde519be..edaf3fe8dc5e 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -17,6 +17,7 @@
17#include <linux/spinlock.h> 17#include <linux/spinlock.h>
18#include <linux/io.h> 18#include <linux/io.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/platform_device.h>
20 21
21#include <asm/geode.h> 22#include <asm/geode.h>
22#include <asm/setup.h> 23#include <asm/setup.h>
@@ -114,6 +115,7 @@ int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen,
114 unsigned long flags; 115 unsigned long flags;
115 int ret = -EIO; 116 int ret = -EIO;
116 int i; 117 int i;
118 int restarts = 0;
117 119
118 spin_lock_irqsave(&ec_lock, flags); 120 spin_lock_irqsave(&ec_lock, flags);
119 121
@@ -169,7 +171,9 @@ restart:
169 if (wait_on_obf(0x6c, 1)) { 171 if (wait_on_obf(0x6c, 1)) {
170 printk(KERN_ERR "olpc-ec: timeout waiting for" 172 printk(KERN_ERR "olpc-ec: timeout waiting for"
171 " EC to provide data!\n"); 173 " EC to provide data!\n");
172 goto restart; 174 if (restarts++ < 10)
175 goto restart;
176 goto err;
173 } 177 }
174 outbuf[i] = inb(0x68); 178 outbuf[i] = inb(0x68);
175 pr_devel("olpc-ec: received 0x%x\n", outbuf[i]); 179 pr_devel("olpc-ec: received 0x%x\n", outbuf[i]);
@@ -183,8 +187,21 @@ err:
183} 187}
184EXPORT_SYMBOL_GPL(olpc_ec_cmd); 188EXPORT_SYMBOL_GPL(olpc_ec_cmd);
185 189
186#ifdef CONFIG_OLPC_OPENFIRMWARE 190static bool __init check_ofw_architecture(void)
187static void __init platform_detect(void) 191{
192 size_t propsize;
193 char olpc_arch[5];
194 const void *args[] = { NULL, "architecture", olpc_arch, (void *)5 };
195 void *res[] = { &propsize };
196
197 if (olpc_ofw("getprop", args, res)) {
198 printk(KERN_ERR "ofw: getprop call failed!\n");
199 return false;
200 }
201 return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0;
202}
203
204static u32 __init get_board_revision(void)
188{ 205{
189 size_t propsize; 206 size_t propsize;
190 __be32 rev; 207 __be32 rev;
@@ -193,45 +210,43 @@ static void __init platform_detect(void)
193 210
194 if (olpc_ofw("getprop", args, res) || propsize != 4) { 211 if (olpc_ofw("getprop", args, res) || propsize != 4) {
195 printk(KERN_ERR "ofw: getprop call failed!\n"); 212 printk(KERN_ERR "ofw: getprop call failed!\n");
196 rev = cpu_to_be32(0); 213 return cpu_to_be32(0);
197 } 214 }
198 olpc_platform_info.boardrev = be32_to_cpu(rev); 215 return be32_to_cpu(rev);
199} 216}
200#else 217
201static void __init platform_detect(void) 218static bool __init platform_detect(void)
202{ 219{
203 /* stopgap until OFW support is added to the kernel */ 220 if (!check_ofw_architecture())
204 olpc_platform_info.boardrev = olpc_board(0xc2); 221 return false;
222 olpc_platform_info.flags |= OLPC_F_PRESENT;
223 olpc_platform_info.boardrev = get_board_revision();
224 return true;
205} 225}
206#endif
207 226
208static int __init olpc_init(void) 227static int __init add_xo1_platform_devices(void)
209{ 228{
210 unsigned char *romsig; 229 struct platform_device *pdev;
211 230
212 /* The ioremap check is dangerous; limit what we run it on */ 231 pdev = platform_device_register_simple("xo1-rfkill", -1, NULL, 0);
213 if (!is_geode() || cs5535_has_vsa2()) 232 if (IS_ERR(pdev))
214 return 0; 233 return PTR_ERR(pdev);
215 234
216 spin_lock_init(&ec_lock); 235 pdev = platform_device_register_simple("olpc-xo1", -1, NULL, 0);
236 if (IS_ERR(pdev))
237 return PTR_ERR(pdev);
217 238
218 romsig = ioremap(0xffffffc0, 16); 239 return 0;
219 if (!romsig) 240}
220 return 0;
221 241
222 if (strncmp(romsig, "CL1 Q", 7)) 242static int __init olpc_init(void)
223 goto unmap; 243{
224 if (strncmp(romsig+6, romsig+13, 3)) { 244 int r = 0;
225 printk(KERN_INFO "OLPC BIOS signature looks invalid. "
226 "Assuming not OLPC\n");
227 goto unmap;
228 }
229 245
230 printk(KERN_INFO "OLPC board with OpenFirmware %.16s\n", romsig); 246 if (!olpc_ofw_present() || !platform_detect())
231 olpc_platform_info.flags |= OLPC_F_PRESENT; 247 return 0;
232 248
233 /* get the platform revision */ 249 spin_lock_init(&ec_lock);
234 platform_detect();
235 250
236 /* assume B1 and above models always have a DCON */ 251 /* assume B1 and above models always have a DCON */
237 if (olpc_board_at_least(olpc_board(0xb1))) 252 if (olpc_board_at_least(olpc_board(0xb1)))
@@ -242,8 +257,10 @@ static int __init olpc_init(void)
242 (unsigned char *) &olpc_platform_info.ecver, 1); 257 (unsigned char *) &olpc_platform_info.ecver, 1);
243 258
244#ifdef CONFIG_PCI_OLPC 259#ifdef CONFIG_PCI_OLPC
245 /* If the VSA exists let it emulate PCI, if not emulate in kernel */ 260 /* If the VSA exists let it emulate PCI, if not emulate in kernel.
246 if (!cs5535_has_vsa2()) 261 * XO-1 only. */
262 if (olpc_platform_info.boardrev < olpc_board_pre(0xd0) &&
263 !cs5535_has_vsa2())
247 x86_init.pci.arch_init = pci_olpc_init; 264 x86_init.pci.arch_init = pci_olpc_init;
248#endif 265#endif
249 266
@@ -252,8 +269,12 @@ static int __init olpc_init(void)
252 olpc_platform_info.boardrev >> 4, 269 olpc_platform_info.boardrev >> 4,
253 olpc_platform_info.ecver); 270 olpc_platform_info.ecver);
254 271
255unmap: 272 if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) { /* XO-1 */
256 iounmap(romsig); 273 r = add_xo1_platform_devices();
274 if (r)
275 return r;
276 }
277
257 return 0; 278 return 0;
258} 279}
259 280
diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c
index 3218aa71ab5e..787320464379 100644
--- a/arch/x86/kernel/olpc_ofw.c
+++ b/arch/x86/kernel/olpc_ofw.c
@@ -74,6 +74,12 @@ int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res,
74} 74}
75EXPORT_SYMBOL_GPL(__olpc_ofw); 75EXPORT_SYMBOL_GPL(__olpc_ofw);
76 76
77bool olpc_ofw_present(void)
78{
79 return olpc_ofw_cif != NULL;
80}
81EXPORT_SYMBOL_GPL(olpc_ofw_present);
82
77/* OFW cif _should_ be above this address */ 83/* OFW cif _should_ be above this address */
78#define OFW_MIN 0xff000000 84#define OFW_MIN 0xff000000
79 85
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1db183ed7c01..c5b250011fd4 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -413,7 +413,6 @@ struct pv_mmu_ops pv_mmu_ops = {
413 413
414 .alloc_pte = paravirt_nop, 414 .alloc_pte = paravirt_nop,
415 .alloc_pmd = paravirt_nop, 415 .alloc_pmd = paravirt_nop,
416 .alloc_pmd_clone = paravirt_nop,
417 .alloc_pud = paravirt_nop, 416 .alloc_pud = paravirt_nop,
418 .release_pte = paravirt_nop, 417 .release_pte = paravirt_nop,
419 .release_pmd = paravirt_nop, 418 .release_pmd = paravirt_nop,
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 0f7f130caa67..c562207b1b3d 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -39,7 +39,7 @@
39#include <asm/cacheflush.h> 39#include <asm/cacheflush.h>
40#include <asm/swiotlb.h> 40#include <asm/swiotlb.h>
41#include <asm/dma.h> 41#include <asm/dma.h>
42#include <asm/k8.h> 42#include <asm/amd_nb.h>
43#include <asm/x86_init.h> 43#include <asm/x86_init.h>
44 44
45static unsigned long iommu_bus_base; /* GART remapping area (physical) */ 45static unsigned long iommu_bus_base; /* GART remapping area (physical) */
@@ -560,8 +560,11 @@ static void enable_gart_translations(void)
560{ 560{
561 int i; 561 int i;
562 562
563 for (i = 0; i < num_k8_northbridges; i++) { 563 if (!k8_northbridges.gart_supported)
564 struct pci_dev *dev = k8_northbridges[i]; 564 return;
565
566 for (i = 0; i < k8_northbridges.num; i++) {
567 struct pci_dev *dev = k8_northbridges.nb_misc[i];
565 568
566 enable_gart_translation(dev, __pa(agp_gatt_table)); 569 enable_gart_translation(dev, __pa(agp_gatt_table));
567 } 570 }
@@ -592,16 +595,19 @@ static void gart_fixup_northbridges(struct sys_device *dev)
592 if (!fix_up_north_bridges) 595 if (!fix_up_north_bridges)
593 return; 596 return;
594 597
598 if (!k8_northbridges.gart_supported)
599 return;
600
595 pr_info("PCI-DMA: Restoring GART aperture settings\n"); 601 pr_info("PCI-DMA: Restoring GART aperture settings\n");
596 602
597 for (i = 0; i < num_k8_northbridges; i++) { 603 for (i = 0; i < k8_northbridges.num; i++) {
598 struct pci_dev *dev = k8_northbridges[i]; 604 struct pci_dev *dev = k8_northbridges.nb_misc[i];
599 605
600 /* 606 /*
601 * Don't enable translations just yet. That is the next 607 * Don't enable translations just yet. That is the next
602 * step. Restore the pre-suspend aperture settings. 608 * step. Restore the pre-suspend aperture settings.
603 */ 609 */
604 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1); 610 gart_set_size_and_enable(dev, aperture_order);
605 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25); 611 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25);
606 } 612 }
607} 613}
@@ -649,8 +655,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
649 655
650 aper_size = aper_base = info->aper_size = 0; 656 aper_size = aper_base = info->aper_size = 0;
651 dev = NULL; 657 dev = NULL;
652 for (i = 0; i < num_k8_northbridges; i++) { 658 for (i = 0; i < k8_northbridges.num; i++) {
653 dev = k8_northbridges[i]; 659 dev = k8_northbridges.nb_misc[i];
654 new_aper_base = read_aperture(dev, &new_aper_size); 660 new_aper_base = read_aperture(dev, &new_aper_size);
655 if (!new_aper_base) 661 if (!new_aper_base)
656 goto nommu; 662 goto nommu;
@@ -718,10 +724,13 @@ static void gart_iommu_shutdown(void)
718 if (!no_agp) 724 if (!no_agp)
719 return; 725 return;
720 726
721 for (i = 0; i < num_k8_northbridges; i++) { 727 if (!k8_northbridges.gart_supported)
728 return;
729
730 for (i = 0; i < k8_northbridges.num; i++) {
722 u32 ctl; 731 u32 ctl;
723 732
724 dev = k8_northbridges[i]; 733 dev = k8_northbridges.nb_misc[i];
725 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); 734 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
726 735
727 ctl &= ~GARTEN; 736 ctl &= ~GARTEN;
@@ -739,7 +748,7 @@ int __init gart_iommu_init(void)
739 unsigned long scratch; 748 unsigned long scratch;
740 long i; 749 long i;
741 750
742 if (num_k8_northbridges == 0) 751 if (!k8_northbridges.gart_supported)
743 return 0; 752 return 0;
744 753
745#ifndef CONFIG_AGP_AMD64 754#ifndef CONFIG_AGP_AMD64
diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c
deleted file mode 100644
index b112406f1996..000000000000
--- a/arch/x86/kernel/pmtimer_64.c
+++ /dev/null
@@ -1,69 +0,0 @@
1/* Ported over from i386 by AK, original copyright was:
2 *
3 * (C) Dominik Brodowski <linux@brodo.de> 2003
4 *
5 * Driver to use the Power Management Timer (PMTMR) available in some
6 * southbridges as primary timing source for the Linux kernel.
7 *
8 * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
9 * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
10 *
11 * This file is licensed under the GPL v2.
12 *
13 * Dropped all the hardware bug workarounds for now. Hopefully they
14 * are not needed on 64bit chipsets.
15 */
16
17#include <linux/jiffies.h>
18#include <linux/kernel.h>
19#include <linux/time.h>
20#include <linux/init.h>
21#include <linux/cpumask.h>
22#include <linux/acpi_pmtmr.h>
23
24#include <asm/io.h>
25#include <asm/proto.h>
26#include <asm/msr.h>
27#include <asm/vsyscall.h>
28
29static inline u32 cyc2us(u32 cycles)
30{
31 /* The Power Management Timer ticks at 3.579545 ticks per microsecond.
32 * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%]
33 *
34 * Even with HZ = 100, delta is at maximum 35796 ticks, so it can
35 * easily be multiplied with 286 (=0x11E) without having to fear
36 * u32 overflows.
37 */
38 cycles *= 286;
39 return (cycles >> 10);
40}
41
42static unsigned pmtimer_wait_tick(void)
43{
44 u32 a, b;
45 for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK;
46 a == b;
47 b = inl(pmtmr_ioport) & ACPI_PM_MASK)
48 cpu_relax();
49 return b;
50}
51
52/* note: wait time is rounded up to one tick */
53void pmtimer_wait(unsigned us)
54{
55 u32 a, b;
56 a = pmtimer_wait_tick();
57 do {
58 b = inl(pmtmr_ioport);
59 cpu_relax();
60 } while (cyc2us(b - a) < us);
61}
62
63static int __init nopmtimer_setup(char *s)
64{
65 pmtmr_ioport = 0;
66 return 1;
67}
68
69__setup("nopmtimer", nopmtimer_setup);
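
The cyc2us() helper in the file removed above converts PM timer ticks (3.579545 MHz) to microseconds with the approximation 1/3.579545 ≈ 286/1024, which is about 0.024% low but keeps the conversion to one multiply and one shift with no risk of u32 overflow for deltas bounded by a tick interval. A standalone restatement, for reference only:

/* exact factor: 1 / 3.579545 = 0.279365...; 286 / 1024 = 0.279297 */
static inline unsigned int cyc2us_approx(unsigned int cycles)
{
	return (cycles * 286) >> 10;
}
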
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3d9ea531ddd1..b3d7a3a04f38 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -424,7 +424,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
424 load_TLS(next, cpu); 424 load_TLS(next, cpu);
425 425
426 /* Must be after DS reload */ 426 /* Must be after DS reload */
427 unlazy_fpu(prev_p); 427 __unlazy_fpu(prev_p);
428 428
429 /* Make sure cpu is ready for new context */ 429 /* Make sure cpu is ready for new context */
430 if (preload_fpu) 430 if (preload_fpu)
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index e3af342fe83a..7a4cf14223ba 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -84,7 +84,7 @@ static int __init reboot_setup(char *str)
84 } 84 }
85 /* we will leave sorting out the final value 85 /* we will leave sorting out the final value
86 when we are ready to reboot, since we might not 86 when we are ready to reboot, since we might not
87 have set up boot_cpu_id or smp_num_cpu */ 87 have detected BSP APIC ID or smp_num_cpu */
88 break; 88 break;
89#endif /* CONFIG_SMP */ 89#endif /* CONFIG_SMP */
90 90
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 00e167870f71..a59f6a6df5e2 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -83,7 +83,6 @@
83#include <asm/dmi.h> 83#include <asm/dmi.h>
84#include <asm/io_apic.h> 84#include <asm/io_apic.h>
85#include <asm/ist.h> 85#include <asm/ist.h>
86#include <asm/vmi.h>
87#include <asm/setup_arch.h> 86#include <asm/setup_arch.h>
88#include <asm/bios_ebda.h> 87#include <asm/bios_ebda.h>
89#include <asm/cacheflush.h> 88#include <asm/cacheflush.h>
@@ -107,7 +106,7 @@
107#include <asm/percpu.h> 106#include <asm/percpu.h>
108#include <asm/topology.h> 107#include <asm/topology.h>
109#include <asm/apicdef.h> 108#include <asm/apicdef.h>
110#include <asm/k8.h> 109#include <asm/amd_nb.h>
111#ifdef CONFIG_X86_64 110#ifdef CONFIG_X86_64
112#include <asm/numa_64.h> 111#include <asm/numa_64.h>
113#endif 112#endif
@@ -126,7 +125,6 @@ unsigned long max_pfn_mapped;
126RESERVE_BRK(dmi_alloc, 65536); 125RESERVE_BRK(dmi_alloc, 65536);
127#endif 126#endif
128 127
129unsigned int boot_cpu_id __read_mostly;
130 128
131static __initdata unsigned long _brk_start = (unsigned long)__brk_base; 129static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
132unsigned long _brk_end = (unsigned long)__brk_base; 130unsigned long _brk_end = (unsigned long)__brk_base;
@@ -619,79 +617,7 @@ static __init void reserve_ibft_region(void)
619 reserve_early_overlap_ok(addr, addr + size, "ibft"); 617 reserve_early_overlap_ok(addr, addr + size, "ibft");
620} 618}
621 619
622#ifdef CONFIG_X86_RESERVE_LOW_64K 620static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
623static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
624{
625 printk(KERN_NOTICE
626 "%s detected: BIOS may corrupt low RAM, working around it.\n",
627 d->ident);
628
629 e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED);
630 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
631
632 return 0;
633}
634#endif
635
636/* List of systems that have known low memory corruption BIOS problems */
637static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
638#ifdef CONFIG_X86_RESERVE_LOW_64K
639 {
640 .callback = dmi_low_memory_corruption,
641 .ident = "AMI BIOS",
642 .matches = {
643 DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
644 },
645 },
646 {
647 .callback = dmi_low_memory_corruption,
648 .ident = "Phoenix BIOS",
649 .matches = {
650 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"),
651 },
652 },
653 {
654 .callback = dmi_low_memory_corruption,
655 .ident = "Phoenix/MSC BIOS",
656 .matches = {
657 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"),
658 },
659 },
660 /*
661 * AMI BIOS with low memory corruption was found on Intel DG45ID and
662 * DG45FC boards.
663 * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
664 * match only DMI_BOARD_NAME and see if there is more bad products
665 * with this vendor.
666 */
667 {
668 .callback = dmi_low_memory_corruption,
669 .ident = "AMI BIOS",
670 .matches = {
671 DMI_MATCH(DMI_BOARD_NAME, "DG45ID"),
672 },
673 },
674 {
675 .callback = dmi_low_memory_corruption,
676 .ident = "AMI BIOS",
677 .matches = {
678 DMI_MATCH(DMI_BOARD_NAME, "DG45FC"),
679 },
680 },
681 /*
682 * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so
683 * match on the product name.
684 */
685 {
686 .callback = dmi_low_memory_corruption,
687 .ident = "Phoenix BIOS",
688 .matches = {
689 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"),
690 },
691 },
692#endif
693 {}
694};
695 621
696static void __init trim_bios_range(void) 622static void __init trim_bios_range(void)
697{ 623{
@@ -699,8 +625,14 @@ static void __init trim_bios_range(void)
699 * A special case is the first 4Kb of memory; 625 * A special case is the first 4Kb of memory;
700 * This is a BIOS owned area, not kernel ram, but generally 626 * This is a BIOS owned area, not kernel ram, but generally
701 * not listed as such in the E820 table. 627 * not listed as such in the E820 table.
628 *
629 * This typically reserves additional memory (64KiB by default)
630 * since some BIOSes are known to corrupt low memory. See the
631 * Kconfig help text for X86_RESERVE_LOW.
702 */ 632 */
703 e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED); 633 e820_update_range(0, ALIGN(reserve_low, PAGE_SIZE),
634 E820_RAM, E820_RESERVED);
635
704 /* 636 /*
705 * special case: Some BIOSen report the PC BIOS 637 * special case: Some BIOSen report the PC BIOS
706 * area (640->1Mb) as ram even though it is not. 638 * area (640->1Mb) as ram even though it is not.
@@ -710,6 +642,28 @@ static void __init trim_bios_range(void)
710 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 642 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
711} 643}
712 644
645static int __init parse_reservelow(char *p)
646{
647 unsigned long long size;
648
649 if (!p)
650 return -EINVAL;
651
652 size = memparse(p, &p);
653
654 if (size < 4096)
655 size = 4096;
656
657 if (size > 640*1024)
658 size = 640*1024;
659
660 reserve_low = size;
661
662 return 0;
663}
664
665early_param("reservelow", parse_reservelow);
666
713/* 667/*
714 * Determine if we were loaded by an EFI loader. If so, then we have also been 668 * Determine if we were loaded by an EFI loader. If so, then we have also been
715 * passed the efi memmap, systab, etc., so we should use these data structures 669 * passed the efi memmap, systab, etc., so we should use these data structures
@@ -736,10 +690,10 @@ void __init setup_arch(char **cmdline_p)
736 printk(KERN_INFO "Command line: %s\n", boot_command_line); 690 printk(KERN_INFO "Command line: %s\n", boot_command_line);
737#endif 691#endif
738 692
739 /* VMI may relocate the fixmap; do this before touching ioremap area */ 693 /*
740 vmi_init(); 694 * If we have OLPC OFW, we might end up relocating the fixmap due to
741 695 * reserve_top(), so do this before touching the ioremap area.
742 /* OFW also may relocate the fixmap */ 696 */
743 olpc_ofw_detect(); 697 olpc_ofw_detect();
744 698
745 early_trap_init(); 699 early_trap_init();
@@ -840,9 +794,6 @@ void __init setup_arch(char **cmdline_p)
840 794
841 x86_report_nx(); 795 x86_report_nx();
842 796
843 /* Must be before kernel pagetables are setup */
844 vmi_activate();
845
846 /* after early param, so could get panic from serial */ 797 /* after early param, so could get panic from serial */
847 reserve_early_setup_data(); 798 reserve_early_setup_data();
848 799
@@ -865,8 +816,6 @@ void __init setup_arch(char **cmdline_p)
865 816
866 dmi_scan_machine(); 817 dmi_scan_machine();
867 818
868 dmi_check_system(bad_bios_dmi_table);
869
870 /* 819 /*
871 * VMware detection requires dmi to be available, so this 820 * VMware detection requires dmi to be available, so this
872 * needs to be done after dmi_scan_machine, for the BP. 821 * needs to be done after dmi_scan_machine, for the BP.
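
The DMI blacklist removed from setup.c is replaced by the "reservelow=" early parameter: the argument goes through memparse(), so size suffixes such as "64k" work, the result is clamped to the 4KiB-640KiB range, and trim_bios_range() then marks ALIGN(reserve_low, PAGE_SIZE) bytes of low RAM as reserved. A sketch of the combined effect (hypothetical helper, values for illustration):

static unsigned long example_effective_reservelow(char *arg)
{
	unsigned long long size = memparse(arg, &arg);

	if (size < 4096)		/* never below one page */
		size = 4096;
	if (size > 640 * 1024)		/* never above the low 640KiB */
		size = 640 * 1024;

	return ALIGN(size, PAGE_SIZE);	/* "reservelow=64k" -> 65536 */
}
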
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index a60df9ae6454..2335c15c93a4 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -253,7 +253,7 @@ void __init setup_per_cpu_areas(void)
253 * Up to this point, the boot CPU has been using .init.data 253 * Up to this point, the boot CPU has been using .init.data
254 * area. Reload any changed state for the boot CPU. 254 * area. Reload any changed state for the boot CPU.
255 */ 255 */
256 if (cpu == boot_cpu_id) 256 if (!cpu)
257 switch_to_new_gdt(cpu); 257 switch_to_new_gdt(cpu);
258 } 258 }
259 259
diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c
index cb22acf3ed09..dd4c281ffe57 100644
--- a/arch/x86/kernel/sfi.c
+++ b/arch/x86/kernel/sfi.c
@@ -34,7 +34,7 @@
34#ifdef CONFIG_X86_LOCAL_APIC 34#ifdef CONFIG_X86_LOCAL_APIC
35static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; 35static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
36 36
37void __init mp_sfi_register_lapic_address(unsigned long address) 37static void __init mp_sfi_register_lapic_address(unsigned long address)
38{ 38{
39 mp_lapic_addr = address; 39 mp_lapic_addr = address;
40 40
@@ -46,7 +46,7 @@ void __init mp_sfi_register_lapic_address(unsigned long address)
46} 46}
47 47
48/* All CPUs enumerated by SFI must be present and enabled */ 48/* All CPUs enumerated by SFI must be present and enabled */
49void __cpuinit mp_sfi_register_lapic(u8 id) 49static void __cpuinit mp_sfi_register_lapic(u8 id)
50{ 50{
51 if (MAX_APICS - id <= 0) { 51 if (MAX_APICS - id <= 0) {
52 pr_warning("Processor #%d invalid (max %d)\n", 52 pr_warning("Processor #%d invalid (max %d)\n",
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8b3bfc4dd708..dfb50890b5b7 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -62,7 +62,7 @@
62#include <asm/pgtable.h> 62#include <asm/pgtable.h>
63#include <asm/tlbflush.h> 63#include <asm/tlbflush.h>
64#include <asm/mtrr.h> 64#include <asm/mtrr.h>
65#include <asm/vmi.h> 65#include <asm/mwait.h>
66#include <asm/apic.h> 66#include <asm/apic.h>
67#include <asm/setup.h> 67#include <asm/setup.h>
68#include <asm/uv/uv.h> 68#include <asm/uv/uv.h>
@@ -311,7 +311,6 @@ notrace static void __cpuinit start_secondary(void *unused)
311 __flush_tlb_all(); 311 __flush_tlb_all();
312#endif 312#endif
313 313
314 vmi_bringup();
315 cpu_init(); 314 cpu_init();
316 preempt_disable(); 315 preempt_disable();
317 smp_callin(); 316 smp_callin();
@@ -324,9 +323,9 @@ notrace static void __cpuinit start_secondary(void *unused)
324 check_tsc_sync_target(); 323 check_tsc_sync_target();
325 324
326 if (nmi_watchdog == NMI_IO_APIC) { 325 if (nmi_watchdog == NMI_IO_APIC) {
327 legacy_pic->chip->mask(0); 326 legacy_pic->mask(0);
328 enable_NMI_through_LVT0(); 327 enable_NMI_through_LVT0();
329 legacy_pic->chip->unmask(0); 328 legacy_pic->unmask(0);
330 } 329 }
331 330
332 /* This must be done before setting cpu_online_mask */ 331 /* This must be done before setting cpu_online_mask */
@@ -397,6 +396,19 @@ void __cpuinit smp_store_cpu_info(int id)
397 identify_secondary_cpu(c); 396 identify_secondary_cpu(c);
398} 397}
399 398
399static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
400{
401 struct cpuinfo_x86 *c1 = &cpu_data(cpu1);
402 struct cpuinfo_x86 *c2 = &cpu_data(cpu2);
403
404 cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
405 cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
406 cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
407 cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
408 cpumask_set_cpu(cpu1, c2->llc_shared_map);
409 cpumask_set_cpu(cpu2, c1->llc_shared_map);
410}
411
400 412
401void __cpuinit set_cpu_sibling_map(int cpu) 413void __cpuinit set_cpu_sibling_map(int cpu)
402{ 414{
@@ -409,14 +421,13 @@ void __cpuinit set_cpu_sibling_map(int cpu)
409 for_each_cpu(i, cpu_sibling_setup_mask) { 421 for_each_cpu(i, cpu_sibling_setup_mask) {
410 struct cpuinfo_x86 *o = &cpu_data(i); 422 struct cpuinfo_x86 *o = &cpu_data(i);
411 423
412 if (c->phys_proc_id == o->phys_proc_id && 424 if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
413 c->cpu_core_id == o->cpu_core_id) { 425 if (c->phys_proc_id == o->phys_proc_id &&
414 cpumask_set_cpu(i, cpu_sibling_mask(cpu)); 426 c->compute_unit_id == o->compute_unit_id)
415 cpumask_set_cpu(cpu, cpu_sibling_mask(i)); 427 link_thread_siblings(cpu, i);
416 cpumask_set_cpu(i, cpu_core_mask(cpu)); 428 } else if (c->phys_proc_id == o->phys_proc_id &&
417 cpumask_set_cpu(cpu, cpu_core_mask(i)); 429 c->cpu_core_id == o->cpu_core_id) {
418 cpumask_set_cpu(i, c->llc_shared_map); 430 link_thread_siblings(cpu, i);
419 cpumask_set_cpu(cpu, o->llc_shared_map);
420 } 431 }
421 } 432 }
422 } else { 433 } else {
@@ -1109,8 +1120,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1109 } 1120 }
1110 set_cpu_sibling_map(0); 1121 set_cpu_sibling_map(0);
1111 1122
1112 enable_IR_x2apic();
1113 default_setup_apic_routing();
1114 1123
1115 if (smp_sanity_check(max_cpus) < 0) { 1124 if (smp_sanity_check(max_cpus) < 0) {
1116 printk(KERN_INFO "SMP disabled\n"); 1125 printk(KERN_INFO "SMP disabled\n");
@@ -1118,6 +1127,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1118 goto out; 1127 goto out;
1119 } 1128 }
1120 1129
1130 default_setup_apic_routing();
1131
1121 preempt_disable(); 1132 preempt_disable();
1122 if (read_apic_id() != boot_cpu_physical_apicid) { 1133 if (read_apic_id() != boot_cpu_physical_apicid) {
1123 panic("Boot APIC ID in local APIC unexpected (%d vs %d)", 1134 panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
@@ -1383,11 +1394,88 @@ void play_dead_common(void)
1383 local_irq_disable(); 1394 local_irq_disable();
1384} 1395}
1385 1396
1397/*
1398 * We need to flush the caches before going to sleep, lest we have
1399 * dirty data in our caches when we come back up.
1400 */
1401static inline void mwait_play_dead(void)
1402{
1403 unsigned int eax, ebx, ecx, edx;
1404 unsigned int highest_cstate = 0;
1405 unsigned int highest_subcstate = 0;
1406 int i;
1407 void *mwait_ptr;
1408
1409 if (!cpu_has(&current_cpu_data, X86_FEATURE_MWAIT))
1410 return;
1411 if (!cpu_has(&current_cpu_data, X86_FEATURE_CLFLSH))
1412 return;
1413 if (current_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
1414 return;
1415
1416 eax = CPUID_MWAIT_LEAF;
1417 ecx = 0;
1418 native_cpuid(&eax, &ebx, &ecx, &edx);
1419
1420 /*
1421 * eax will be 0 if EDX enumeration is not valid.
1422 * Initialized below to cstate, sub_cstate value when EDX is valid.
1423 */
1424 if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
1425 eax = 0;
1426 } else {
1427 edx >>= MWAIT_SUBSTATE_SIZE;
1428 for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
1429 if (edx & MWAIT_SUBSTATE_MASK) {
1430 highest_cstate = i;
1431 highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
1432 }
1433 }
1434 eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
1435 (highest_subcstate - 1);
1436 }
1437
1438 /*
1439 * This should be a memory location in a cache line which is
1440 * unlikely to be touched by other processors. The actual
1441 * content is immaterial as it is not actually modified in any way.
1442 */
1443 mwait_ptr = &current_thread_info()->flags;
1444
1445 wbinvd();
1446
1447 while (1) {
1448 /*
1449 * The CLFLUSH is a workaround for erratum AAI65 for
1450 * the Xeon 7400 series. It's not clear it is actually
1451 * needed, but it should be harmless in either case.
1452 * The WBINVD is insufficient due to the spurious-wakeup
1453 * case where we return around the loop.
1454 */
1455 clflush(mwait_ptr);
1456 __monitor(mwait_ptr, 0, 0);
1457 mb();
1458 __mwait(eax, 0);
1459 }
1460}
1461
1462static inline void hlt_play_dead(void)
1463{
1464 if (current_cpu_data.x86 >= 4)
1465 wbinvd();
1466
1467 while (1) {
1468 native_halt();
1469 }
1470}
1471
1386void native_play_dead(void) 1472void native_play_dead(void)
1387{ 1473{
1388 play_dead_common(); 1474 play_dead_common();
1389 tboot_shutdown(TB_SHUTDOWN_WFS); 1475 tboot_shutdown(TB_SHUTDOWN_WFS);
1390 wbinvd_halt(); 1476
1477 mwait_play_dead(); /* Only returns on failure */
1478 hlt_play_dead();
1391} 1479}
1392 1480
1393#else /* ... !CONFIG_HOTPLUG_CPU */ 1481#else /* ... !CONFIG_HOTPLUG_CPU */
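
For reference, the hint that mwait_play_dead() hands to MWAIT comes from CPUID leaf 5: EDX holds a 4-bit sub-state count per C-state, the C0 nibble is skipped, and the hint encodes (deepest C-state << 4) | (sub-states - 1). A self-contained sketch of just that derivation (not kernel code; the kernel additionally checks CPUID5_ECX_EXTENSIONS_SUPPORTED before trusting EDX):

#include <stdint.h>

static uint32_t deepest_mwait_hint(uint32_t edx)
{
	uint32_t cstate = 0, substate = 0, i;

	edx >>= 4;				/* skip the C0 sub-state count */
	for (i = 0; i < 7 && edx; i++, edx >>= 4) {
		if (edx & 0xf) {
			cstate = i;
			substate = edx & 0xf;
		}
	}
	if (!substate)				/* nothing advertised */
		return 0;
	return (cstate << 4) | (substate - 1);
}
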
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index d5e06624e34a..0b0cb5fede19 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -33,8 +33,8 @@ int kernel_execve(const char *filename,
33 const char *const envp[]) 33 const char *const envp[])
34{ 34{
35 long __res; 35 long __res;
36 asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" 36 asm volatile ("int $0x80"
37 : "=a" (__res) 37 : "=a" (__res)
38 : "0" (__NR_execve), "ri" (filename), "c" (argv), "d" (envp) : "memory"); 38 : "0" (__NR_execve), "b" (filename), "c" (argv), "d" (envp) : "memory");
39 return __res; 39 return __res;
40} 40}
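
The kernel_execve() change works because the i386 int $0x80 convention takes the system call number in EAX and the first three arguments in EBX, ECX and EDX, so the "b" constraint loads filename into the right register and the old push/movl/pop of %ebx becomes unnecessary. A generic sketch of the same convention (illustrative, not part of the patch):

static inline long example_syscall3(long nr, long a1, long a2, long a3)
{
	long ret;

	asm volatile ("int $0x80"
		      : "=a" (ret)
		      : "0" (nr), "b" (a1), "c" (a2), "d" (a3)
		      : "memory");
	return ret;
}
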
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 60788dee0f8a..d43968503dd2 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -776,21 +776,10 @@ asmlinkage void math_state_restore(void)
776} 776}
777EXPORT_SYMBOL_GPL(math_state_restore); 777EXPORT_SYMBOL_GPL(math_state_restore);
778 778
779#ifndef CONFIG_MATH_EMULATION
780void math_emulate(struct math_emu_info *info)
781{
782 printk(KERN_EMERG
783 "math-emulation not enabled and no coprocessor found.\n");
784 printk(KERN_EMERG "killing %s.\n", current->comm);
785 force_sig(SIGFPE, current);
786 schedule();
787}
788#endif /* CONFIG_MATH_EMULATION */
789
790dotraplinkage void __kprobes 779dotraplinkage void __kprobes
791do_device_not_available(struct pt_regs *regs, long error_code) 780do_device_not_available(struct pt_regs *regs, long error_code)
792{ 781{
793#ifdef CONFIG_X86_32 782#ifdef CONFIG_MATH_EMULATION
794 if (read_cr0() & X86_CR0_EM) { 783 if (read_cr0() & X86_CR0_EM) {
795 struct math_emu_info info = { }; 784 struct math_emu_info info = { };
796 785
@@ -798,12 +787,12 @@ do_device_not_available(struct pt_regs *regs, long error_code)
798 787
799 info.regs = regs; 788 info.regs = regs;
800 math_emulate(&info); 789 math_emulate(&info);
801 } else { 790 return;
802 math_state_restore(); /* interrupts still off */
803 conditional_sti(regs);
804 } 791 }
805#else 792#endif
806 math_state_restore(); 793 math_state_restore(); /* interrupts still off */
794#ifdef CONFIG_X86_32
795 conditional_sti(regs);
807#endif 796#endif
808} 797}
809 798
@@ -881,18 +870,6 @@ void __init trap_init(void)
881#endif 870#endif
882 871
883#ifdef CONFIG_X86_32 872#ifdef CONFIG_X86_32
884 if (cpu_has_fxsr) {
885 printk(KERN_INFO "Enabling fast FPU save and restore... ");
886 set_in_cr4(X86_CR4_OSFXSR);
887 printk("done.\n");
888 }
889 if (cpu_has_xmm) {
890 printk(KERN_INFO
891 "Enabling unmasked SIMD FPU exception support... ");
892 set_in_cr4(X86_CR4_OSXMMEXCPT);
893 printk("done.\n");
894 }
895
896 set_system_trap_gate(SYSCALL_VECTOR, &system_call); 873 set_system_trap_gate(SYSCALL_VECTOR, &system_call);
897 set_bit(SYSCALL_VECTOR, used_vectors); 874 set_bit(SYSCALL_VECTOR, used_vectors);
898#endif 875#endif
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 26a863a9c2a8..0c40d8b72416 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -104,10 +104,14 @@ int __init notsc_setup(char *str)
104 104
105__setup("notsc", notsc_setup); 105__setup("notsc", notsc_setup);
106 106
107static int no_sched_irq_time;
108
107static int __init tsc_setup(char *str) 109static int __init tsc_setup(char *str)
108{ 110{
109 if (!strcmp(str, "reliable")) 111 if (!strcmp(str, "reliable"))
110 tsc_clocksource_reliable = 1; 112 tsc_clocksource_reliable = 1;
113 if (!strncmp(str, "noirqtime", 9))
114 no_sched_irq_time = 1;
111 return 1; 115 return 1;
112} 116}
113 117
@@ -801,6 +805,7 @@ void mark_tsc_unstable(char *reason)
801 if (!tsc_unstable) { 805 if (!tsc_unstable) {
802 tsc_unstable = 1; 806 tsc_unstable = 1;
803 sched_clock_stable = 0; 807 sched_clock_stable = 0;
808 disable_sched_clock_irqtime();
804 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); 809 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
805 /* Change only the rating, when not registered */ 810 /* Change only the rating, when not registered */
806 if (clocksource_tsc.mult) 811 if (clocksource_tsc.mult)
@@ -892,60 +897,6 @@ static void __init init_tsc_clocksource(void)
892 clocksource_register_khz(&clocksource_tsc, tsc_khz); 897 clocksource_register_khz(&clocksource_tsc, tsc_khz);
893} 898}
894 899
895#ifdef CONFIG_X86_64
896/*
897 * calibrate_cpu is used on systems with fixed rate TSCs to determine
898 * processor frequency
899 */
900#define TICK_COUNT 100000000
901static unsigned long __init calibrate_cpu(void)
902{
903 int tsc_start, tsc_now;
904 int i, no_ctr_free;
905 unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
906 unsigned long flags;
907
908 for (i = 0; i < 4; i++)
909 if (avail_to_resrv_perfctr_nmi_bit(i))
910 break;
911 no_ctr_free = (i == 4);
912 if (no_ctr_free) {
913 WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
914 "cpu_khz value may be incorrect.\n");
915 i = 3;
916 rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
917 wrmsrl(MSR_K7_EVNTSEL3, 0);
918 rdmsrl(MSR_K7_PERFCTR3, pmc3);
919 } else {
920 reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
921 reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
922 }
923 local_irq_save(flags);
924 /* start measuring cycles, incrementing from 0 */
925 wrmsrl(MSR_K7_PERFCTR0 + i, 0);
926 wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
927 rdtscl(tsc_start);
928 do {
929 rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
930 tsc_now = get_cycles();
931 } while ((tsc_now - tsc_start) < TICK_COUNT);
932
933 local_irq_restore(flags);
934 if (no_ctr_free) {
935 wrmsrl(MSR_K7_EVNTSEL3, 0);
936 wrmsrl(MSR_K7_PERFCTR3, pmc3);
937 wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
938 } else {
939 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
940 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
941 }
942
943 return pmc_now * tsc_khz / (tsc_now - tsc_start);
944}
945#else
946static inline unsigned long calibrate_cpu(void) { return cpu_khz; }
947#endif
948
949void __init tsc_init(void) 900void __init tsc_init(void)
950{ 901{
951 u64 lpj; 902 u64 lpj;
@@ -964,10 +915,6 @@ void __init tsc_init(void)
964 return; 915 return;
965 } 916 }
966 917
967 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
968 (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
969 cpu_khz = calibrate_cpu();
970
971 printk("Detected %lu.%03lu MHz processor.\n", 918 printk("Detected %lu.%03lu MHz processor.\n",
972 (unsigned long)cpu_khz / 1000, 919 (unsigned long)cpu_khz / 1000,
973 (unsigned long)cpu_khz % 1000); 920 (unsigned long)cpu_khz % 1000);
@@ -987,6 +934,9 @@ void __init tsc_init(void)
987 /* now allow native_sched_clock() to use rdtsc */ 934 /* now allow native_sched_clock() to use rdtsc */
988 tsc_disabled = 0; 935 tsc_disabled = 0;
989 936
937 if (!no_sched_irq_time)
938 enable_sched_clock_irqtime();
939
990 lpj = ((u64)tsc_khz * 1000); 940 lpj = ((u64)tsc_khz * 1000);
991 do_div(lpj, HZ); 941 do_div(lpj, HZ);
992 lpj_fine = lpj; 942 lpj_fine = lpj;
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
index 1132129db792..7b24460917d5 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/kernel/uv_irq.c
@@ -28,34 +28,21 @@ struct uv_irq_2_mmr_pnode{
28static spinlock_t uv_irq_lock; 28static spinlock_t uv_irq_lock;
29static struct rb_root uv_irq_root; 29static struct rb_root uv_irq_root;
30 30
31static int uv_set_irq_affinity(unsigned int, const struct cpumask *); 31static int uv_set_irq_affinity(struct irq_data *, const struct cpumask *, bool);
32 32
33static void uv_noop(unsigned int irq) 33static void uv_noop(struct irq_data *data) { }
34{
35}
36
37static unsigned int uv_noop_ret(unsigned int irq)
38{
39 return 0;
40}
41 34
42static void uv_ack_apic(unsigned int irq) 35static void uv_ack_apic(struct irq_data *data)
43{ 36{
44 ack_APIC_irq(); 37 ack_APIC_irq();
45} 38}
46 39
47static struct irq_chip uv_irq_chip = { 40static struct irq_chip uv_irq_chip = {
48 .name = "UV-CORE", 41 .name = "UV-CORE",
49 .startup = uv_noop_ret, 42 .irq_mask = uv_noop,
50 .shutdown = uv_noop, 43 .irq_unmask = uv_noop,
51 .enable = uv_noop, 44 .irq_eoi = uv_ack_apic,
52 .disable = uv_noop, 45 .irq_set_affinity = uv_set_irq_affinity,
53 .ack = uv_noop,
54 .mask = uv_noop,
55 .unmask = uv_noop,
56 .eoi = uv_ack_apic,
57 .end = uv_noop,
58 .set_affinity = uv_set_irq_affinity,
59}; 46};
60 47
61/* 48/*
@@ -144,26 +131,22 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
144 unsigned long mmr_offset, int limit) 131 unsigned long mmr_offset, int limit)
145{ 132{
146 const struct cpumask *eligible_cpu = cpumask_of(cpu); 133 const struct cpumask *eligible_cpu = cpumask_of(cpu);
147 struct irq_desc *desc = irq_to_desc(irq); 134 struct irq_cfg *cfg = get_irq_chip_data(irq);
148 struct irq_cfg *cfg;
149 int mmr_pnode;
150 unsigned long mmr_value; 135 unsigned long mmr_value;
151 struct uv_IO_APIC_route_entry *entry; 136 struct uv_IO_APIC_route_entry *entry;
152 int err; 137 int mmr_pnode, err;
153 138
154 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != 139 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
155 sizeof(unsigned long)); 140 sizeof(unsigned long));
156 141
157 cfg = irq_cfg(irq);
158
159 err = assign_irq_vector(irq, cfg, eligible_cpu); 142 err = assign_irq_vector(irq, cfg, eligible_cpu);
160 if (err != 0) 143 if (err != 0)
161 return err; 144 return err;
162 145
163 if (limit == UV_AFFINITY_CPU) 146 if (limit == UV_AFFINITY_CPU)
164 desc->status |= IRQ_NO_BALANCING; 147 irq_set_status_flags(irq, IRQ_NO_BALANCING);
165 else 148 else
166 desc->status |= IRQ_MOVE_PCNTXT; 149 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
167 150
168 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, 151 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
169 irq_name); 152 irq_name);
@@ -206,17 +189,17 @@ static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset)
206 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); 189 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
207} 190}
208 191
209static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask) 192static int
193uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask,
194 bool force)
210{ 195{
211 struct irq_desc *desc = irq_to_desc(irq); 196 struct irq_cfg *cfg = data->chip_data;
212 struct irq_cfg *cfg = desc->chip_data;
213 unsigned int dest; 197 unsigned int dest;
214 unsigned long mmr_value; 198 unsigned long mmr_value, mmr_offset;
215 struct uv_IO_APIC_route_entry *entry; 199 struct uv_IO_APIC_route_entry *entry;
216 unsigned long mmr_offset;
217 int mmr_pnode; 200 int mmr_pnode;
218 201
219 if (set_desc_affinity(desc, mask, &dest)) 202 if (__ioapic_set_affinity(data, mask, &dest))
220 return -1; 203 return -1;
221 204
222 mmr_value = 0; 205 mmr_value = 0;
@@ -231,7 +214,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
231 entry->dest = dest; 214 entry->dest = dest;
232 215
233 /* Get previously stored MMR and pnode of hub sourcing interrupts */ 216 /* Get previously stored MMR and pnode of hub sourcing interrupts */
234 if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode)) 217 if (uv_irq_2_mmr_info(data->irq, &mmr_offset, &mmr_pnode))
235 return -1; 218 return -1;
236 219
237 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); 220 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
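
The uv_irq.c rework above (and the visws_quirks.c one below) converts to the irq_data-based irq_chip callbacks: each handler receives a struct irq_data * and reads per-IRQ state such as data->irq and data->chip_data directly instead of going through irq_to_desc(). A minimal sketch of the shape such a chip takes; the state structure and register layout are hypothetical:

#include <linux/io.h>
#include <linux/irq.h>

struct example_regs {
	void __iomem *mask_set;		/* hypothetical mask registers */
	void __iomem *mask_clear;
};

static void example_irq_mask(struct irq_data *data)
{
	struct example_regs *regs = data->chip_data;

	writel(1U << (data->irq & 31), regs->mask_set);
}

static void example_irq_unmask(struct irq_data *data)
{
	struct example_regs *regs = data->chip_data;

	writel(1U << (data->irq & 31), regs->mask_clear);
}

static struct irq_chip example_irq_chip = {
	.name		= "EXAMPLE",
	.irq_mask	= example_irq_mask,
	.irq_unmask	= example_irq_unmask,
};
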
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index e680ea52db9b..3371bd053b89 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -66,10 +66,7 @@ static void __init visws_time_init(void)
66} 66}
67 67
68/* Replaces the default init_ISA_irqs in the generic setup */ 68/* Replaces the default init_ISA_irqs in the generic setup */
69static void __init visws_pre_intr_init(void) 69static void __init visws_pre_intr_init(void);
70{
71 init_VISWS_APIC_irqs();
72}
73 70
74/* Quirk for machine specific memory setup. */ 71/* Quirk for machine specific memory setup. */
75 72
@@ -429,67 +426,34 @@ static int is_co_apic(unsigned int irq)
429/* 426/*
430 * This is the SGI Cobalt (IO-)APIC: 427 * This is the SGI Cobalt (IO-)APIC:
431 */ 428 */
432 429static void enable_cobalt_irq(struct irq_data *data)
433static void enable_cobalt_irq(unsigned int irq)
434{ 430{
435 co_apic_set(is_co_apic(irq), irq); 431 co_apic_set(is_co_apic(data->irq), data->irq);
436} 432}
437 433
438static void disable_cobalt_irq(unsigned int irq) 434static void disable_cobalt_irq(struct irq_data *data)
439{ 435{
440 int entry = is_co_apic(irq); 436 int entry = is_co_apic(data->irq);
441 437
442 co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK); 438 co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK);
443 co_apic_read(CO_APIC_LO(entry)); 439 co_apic_read(CO_APIC_LO(entry));
444} 440}
445 441
446/* 442static void ack_cobalt_irq(struct irq_data *data)
447 * "irq" really just serves to identify the device. Here is where we
448 * map this to the Cobalt APIC entry where it's physically wired.
449 * This is called via request_irq -> setup_irq -> irq_desc->startup()
450 */
451static unsigned int startup_cobalt_irq(unsigned int irq)
452{ 443{
453 unsigned long flags; 444 unsigned long flags;
454 struct irq_desc *desc = irq_to_desc(irq);
455 445
456 spin_lock_irqsave(&cobalt_lock, flags); 446 spin_lock_irqsave(&cobalt_lock, flags);
457 if ((desc->status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING))) 447 disable_cobalt_irq(data);
458 desc->status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
459 enable_cobalt_irq(irq);
460 spin_unlock_irqrestore(&cobalt_lock, flags);
461 return 0;
462}
463
464static void ack_cobalt_irq(unsigned int irq)
465{
466 unsigned long flags;
467
468 spin_lock_irqsave(&cobalt_lock, flags);
469 disable_cobalt_irq(irq);
470 apic_write(APIC_EOI, APIC_EIO_ACK); 448 apic_write(APIC_EOI, APIC_EIO_ACK);
471 spin_unlock_irqrestore(&cobalt_lock, flags); 449 spin_unlock_irqrestore(&cobalt_lock, flags);
472} 450}
473 451
474static void end_cobalt_irq(unsigned int irq)
475{
476 unsigned long flags;
477 struct irq_desc *desc = irq_to_desc(irq);
478
479 spin_lock_irqsave(&cobalt_lock, flags);
480 if (!(desc->status & (IRQ_DISABLED | IRQ_INPROGRESS)))
481 enable_cobalt_irq(irq);
482 spin_unlock_irqrestore(&cobalt_lock, flags);
483}
484
485static struct irq_chip cobalt_irq_type = { 452static struct irq_chip cobalt_irq_type = {
486 .name = "Cobalt-APIC", 453 .name = "Cobalt-APIC",
487 .startup = startup_cobalt_irq, 454 .irq_enable = enable_cobalt_irq,
488 .shutdown = disable_cobalt_irq, 455 .irq_disable = disable_cobalt_irq,
489 .enable = enable_cobalt_irq, 456 .irq_ack = ack_cobalt_irq,
490 .disable = disable_cobalt_irq,
491 .ack = ack_cobalt_irq,
492 .end = end_cobalt_irq,
493}; 457};
494 458
495 459
@@ -503,35 +467,34 @@ static struct irq_chip cobalt_irq_type = {
503 * interrupt controller type, and through a special virtual interrupt- 467 * interrupt controller type, and through a special virtual interrupt-
504 * controller. Device drivers only see the virtual interrupt sources. 468 * controller. Device drivers only see the virtual interrupt sources.
505 */ 469 */
506static unsigned int startup_piix4_master_irq(unsigned int irq) 470static unsigned int startup_piix4_master_irq(struct irq_data *data)
507{ 471{
508 legacy_pic->init(0); 472 legacy_pic->init(0);
509 473 enable_cobalt_irq(data);
510 return startup_cobalt_irq(irq);
511} 474}
512 475
513static void end_piix4_master_irq(unsigned int irq) 476static void end_piix4_master_irq(struct irq_data *data)
514{ 477{
515 unsigned long flags; 478 unsigned long flags;
516 479
517 spin_lock_irqsave(&cobalt_lock, flags); 480 spin_lock_irqsave(&cobalt_lock, flags);
518 enable_cobalt_irq(irq); 481 enable_cobalt_irq(data);
519 spin_unlock_irqrestore(&cobalt_lock, flags); 482 spin_unlock_irqrestore(&cobalt_lock, flags);
520} 483}
521 484
522static struct irq_chip piix4_master_irq_type = { 485static struct irq_chip piix4_master_irq_type = {
523 .name = "PIIX4-master", 486 .name = "PIIX4-master",
524 .startup = startup_piix4_master_irq, 487 .irq_startup = startup_piix4_master_irq,
525 .ack = ack_cobalt_irq, 488 .irq_ack = ack_cobalt_irq,
526 .end = end_piix4_master_irq,
527}; 489};
528 490
491static void pii4_mask(struct irq_data *data) { }
529 492
530static struct irq_chip piix4_virtual_irq_type = { 493static struct irq_chip piix4_virtual_irq_type = {
531 .name = "PIIX4-virtual", 494 .name = "PIIX4-virtual",
495 .mask = pii4_mask,
532}; 496};
533 497
534
535/* 498/*
536 * PIIX4-8259 master/virtual functions to handle interrupt requests 499 * PIIX4-8259 master/virtual functions to handle interrupt requests
537 * from legacy devices: floppy, parallel, serial, rtc. 500 * from legacy devices: floppy, parallel, serial, rtc.
@@ -549,9 +512,8 @@ static struct irq_chip piix4_virtual_irq_type = {
549 */ 512 */
550static irqreturn_t piix4_master_intr(int irq, void *dev_id) 513static irqreturn_t piix4_master_intr(int irq, void *dev_id)
551{ 514{
552 int realirq;
553 struct irq_desc *desc;
554 unsigned long flags; 515 unsigned long flags;
516 int realirq;
555 517
556 raw_spin_lock_irqsave(&i8259A_lock, flags); 518 raw_spin_lock_irqsave(&i8259A_lock, flags);
557 519
@@ -592,18 +554,10 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
592 554
593 raw_spin_unlock_irqrestore(&i8259A_lock, flags); 555 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
594 556
595 desc = irq_to_desc(realirq);
596
597 /* 557 /*
598 * handle this 'virtual interrupt' as a Cobalt one now. 558 * handle this 'virtual interrupt' as a Cobalt one now.
599 */ 559 */
600 kstat_incr_irqs_this_cpu(realirq, desc); 560 generic_handle_irq(realirq);
601
602 if (likely(desc->action != NULL))
603 handle_IRQ_event(realirq, desc->action);
604
605 if (!(desc->status & IRQ_DISABLED))
606 legacy_pic->chip->unmask(realirq);
607 561
608 return IRQ_HANDLED; 562 return IRQ_HANDLED;
609 563
@@ -624,41 +578,35 @@ static struct irqaction cascade_action = {
624 578
625static inline void set_piix4_virtual_irq_type(void) 579static inline void set_piix4_virtual_irq_type(void)
626{ 580{
627 piix4_virtual_irq_type.shutdown = i8259A_chip.mask;
628 piix4_virtual_irq_type.enable = i8259A_chip.unmask; 581 piix4_virtual_irq_type.enable = i8259A_chip.unmask;
629 piix4_virtual_irq_type.disable = i8259A_chip.mask; 582 piix4_virtual_irq_type.disable = i8259A_chip.mask;
583 piix4_virtual_irq_type.unmask = i8259A_chip.unmask;
630} 584}
631 585
632void init_VISWS_APIC_irqs(void) 586static void __init visws_pre_intr_init(void)
633{ 587{
634 int i; 588 int i;
635 589
636 for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) { 590 set_piix4_virtual_irq_type();
637 struct irq_desc *desc = irq_to_desc(i);
638
639 desc->status = IRQ_DISABLED;
640 desc->action = 0;
641 desc->depth = 1;
642 591
643 if (i == 0) { 592 for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
644 desc->chip = &cobalt_irq_type; 593 struct irq_chip *chip = NULL;
645 } 594
646 else if (i == CO_IRQ_IDE0) { 595 if (i == 0)
647 desc->chip = &cobalt_irq_type; 596 chip = &cobalt_irq_type;
648 } 597 else if (i == CO_IRQ_IDE0)
649 else if (i == CO_IRQ_IDE1) { 598 chip = &cobalt_irq_type;
650 desc->chip = &cobalt_irq_type; 599 else if (i == CO_IRQ_IDE1)
651 } 600 chip = &cobalt_irq_type;
652 else if (i == CO_IRQ_8259) { 601 else if (i == CO_IRQ_8259)
653 desc->chip = &piix4_master_irq_type; 602 chip = &piix4_master_irq_type;
654 } 603 else if (i < CO_IRQ_APIC0)
655 else if (i < CO_IRQ_APIC0) { 604 chip = &piix4_virtual_irq_type;
656 set_piix4_virtual_irq_type(); 605 else if (IS_CO_APIC(i))
657 desc->chip = &piix4_virtual_irq_type; 606 chip = &cobalt_irq_type;
658 } 607
659 else if (IS_CO_APIC(i)) { 608 if (chip)
660 desc->chip = &cobalt_irq_type; 609 set_irq_chip(i, chip);
661 }
662 } 610 }
663 611
664 setup_irq(CO_IRQ_8259, &master_action); 612 setup_irq(CO_IRQ_8259, &master_action);
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
deleted file mode 100644
index ce9fbacb7526..000000000000
--- a/arch/x86/kernel/vmi_32.c
+++ /dev/null
@@ -1,893 +0,0 @@
1/*
2 * VMI specific paravirt-ops implementation
3 *
4 * Copyright (C) 2005, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Send feedback to zach@vmware.com
22 *
23 */
24
25#include <linux/module.h>
26#include <linux/cpu.h>
27#include <linux/bootmem.h>
28#include <linux/mm.h>
29#include <linux/highmem.h>
30#include <linux/sched.h>
31#include <linux/gfp.h>
32#include <asm/vmi.h>
33#include <asm/io.h>
34#include <asm/fixmap.h>
35#include <asm/apicdef.h>
36#include <asm/apic.h>
37#include <asm/pgalloc.h>
38#include <asm/processor.h>
39#include <asm/timer.h>
40#include <asm/vmi_time.h>
41#include <asm/kmap_types.h>
42#include <asm/setup.h>
43
44/* Convenient for calling VMI functions indirectly in the ROM */
45typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
46typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int);
47
48#define call_vrom_func(rom,func) \
49 (((VROMFUNC *)(rom->func))())
50
51#define call_vrom_long_func(rom,func,arg) \
52 (((VROMLONGFUNC *)(rom->func)) (arg))
53
54static struct vrom_header *vmi_rom;
55static int disable_pge;
56static int disable_pse;
57static int disable_sep;
58static int disable_tsc;
59static int disable_mtrr;
60static int disable_noidle;
61static int disable_vmi_timer;
62
63/* Cached VMI operations */
64static struct {
65 void (*cpuid)(void /* non-c */);
66 void (*_set_ldt)(u32 selector);
67 void (*set_tr)(u32 selector);
68 void (*write_idt_entry)(struct desc_struct *, int, u32, u32);
69 void (*write_gdt_entry)(struct desc_struct *, int, u32, u32);
70 void (*write_ldt_entry)(struct desc_struct *, int, u32, u32);
71 void (*set_kernel_stack)(u32 selector, u32 sp0);
72 void (*allocate_page)(u32, u32, u32, u32, u32);
73 void (*release_page)(u32, u32);
74 void (*set_pte)(pte_t, pte_t *, unsigned);
75 void (*update_pte)(pte_t *, unsigned);
76 void (*set_linear_mapping)(int, void *, u32, u32);
77 void (*_flush_tlb)(int);
78 void (*set_initial_ap_state)(int, int);
79 void (*halt)(void);
80 void (*set_lazy_mode)(int mode);
81} vmi_ops;
82
83/* Cached VMI operations */
84struct vmi_timer_ops vmi_timer_ops;
85
86/*
87 * VMI patching routines.
88 */
89#define MNEM_CALL 0xe8
90#define MNEM_JMP 0xe9
91#define MNEM_RET 0xc3
92
93#define IRQ_PATCH_INT_MASK 0
94#define IRQ_PATCH_DISABLE 5
95
96static inline void patch_offset(void *insnbuf,
97 unsigned long ip, unsigned long dest)
98{
99 *(unsigned long *)(insnbuf+1) = dest-ip-5;
100}
101
102static unsigned patch_internal(int call, unsigned len, void *insnbuf,
103 unsigned long ip)
104{
105 u64 reloc;
106 struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
107 reloc = call_vrom_long_func(vmi_rom, get_reloc, call);
108 switch(rel->type) {
109 case VMI_RELOCATION_CALL_REL:
110 BUG_ON(len < 5);
111 *(char *)insnbuf = MNEM_CALL;
112 patch_offset(insnbuf, ip, (unsigned long)rel->eip);
113 return 5;
114
115 case VMI_RELOCATION_JUMP_REL:
116 BUG_ON(len < 5);
117 *(char *)insnbuf = MNEM_JMP;
118 patch_offset(insnbuf, ip, (unsigned long)rel->eip);
119 return 5;
120
121 case VMI_RELOCATION_NOP:
122 /* obliterate the whole thing */
123 return 0;
124
125 case VMI_RELOCATION_NONE:
126 /* leave native code in place */
127 break;
128
129 default:
130 BUG();
131 }
132 return len;
133}
134
135/*
136 * Apply patch if appropriate, return length of new instruction
137 * sequence. The callee does nop padding for us.
138 */
139static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
140 unsigned long ip, unsigned len)
141{
142 switch (type) {
143 case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
144 return patch_internal(VMI_CALL_DisableInterrupts, len,
145 insns, ip);
146 case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
147 return patch_internal(VMI_CALL_EnableInterrupts, len,
148 insns, ip);
149 case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
150 return patch_internal(VMI_CALL_SetInterruptMask, len,
151 insns, ip);
152 case PARAVIRT_PATCH(pv_irq_ops.save_fl):
153 return patch_internal(VMI_CALL_GetInterruptMask, len,
154 insns, ip);
155 case PARAVIRT_PATCH(pv_cpu_ops.iret):
156 return patch_internal(VMI_CALL_IRET, len, insns, ip);
157 case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit):
158 return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip);
159 default:
160 break;
161 }
162 return len;
163}
164
165/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
166static void vmi_cpuid(unsigned int *ax, unsigned int *bx,
167 unsigned int *cx, unsigned int *dx)
168{
169 int override = 0;
170 if (*ax == 1)
171 override = 1;
172 asm volatile ("call *%6"
173 : "=a" (*ax),
174 "=b" (*bx),
175 "=c" (*cx),
176 "=d" (*dx)
177 : "0" (*ax), "2" (*cx), "r" (vmi_ops.cpuid));
178 if (override) {
179 if (disable_pse)
180 *dx &= ~X86_FEATURE_PSE;
181 if (disable_pge)
182 *dx &= ~X86_FEATURE_PGE;
183 if (disable_sep)
184 *dx &= ~X86_FEATURE_SEP;
185 if (disable_tsc)
186 *dx &= ~X86_FEATURE_TSC;
187 if (disable_mtrr)
188 *dx &= ~X86_FEATURE_MTRR;
189 }
190}
191
192static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
193{
194 if (gdt[nr].a != new->a || gdt[nr].b != new->b)
195 write_gdt_entry(gdt, nr, new, 0);
196}
197
198static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
199{
200 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
201 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]);
202 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]);
203 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]);
204}
205
206static void vmi_set_ldt(const void *addr, unsigned entries)
207{
208 unsigned cpu = smp_processor_id();
209 struct desc_struct desc;
210
211 pack_descriptor(&desc, (unsigned long)addr,
212 entries * sizeof(struct desc_struct) - 1,
213 DESC_LDT, 0);
214 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, &desc, DESC_LDT);
215 vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
216}
217
218static void vmi_set_tr(void)
219{
220 vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
221}
222
223static void vmi_write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
224{
225 u32 *idt_entry = (u32 *)g;
226 vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[1]);
227}
228
229static void vmi_write_gdt_entry(struct desc_struct *dt, int entry,
230 const void *desc, int type)
231{
232 u32 *gdt_entry = (u32 *)desc;
233 vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[1]);
234}
235
236static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
237 const void *desc)
238{
239 u32 *ldt_entry = (u32 *)desc;
240 vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
241}
242
243static void vmi_load_sp0(struct tss_struct *tss,
244 struct thread_struct *thread)
245{
246 tss->x86_tss.sp0 = thread->sp0;
247
248 /* This can only happen when SEP is enabled, no need to test "SEP"arately */
249 if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
250 tss->x86_tss.ss1 = thread->sysenter_cs;
251 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
252 }
253 vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.sp0);
254}
255
256static void vmi_flush_tlb_user(void)
257{
258 vmi_ops._flush_tlb(VMI_FLUSH_TLB);
259}
260
261static void vmi_flush_tlb_kernel(void)
262{
263 vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
264}
265
266/* Stub to do nothing at all; used for delays and unimplemented calls */
267static void vmi_nop(void)
268{
269}
270
271static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
272{
273 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
274}
275
276static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
277{
278 /*
279 * This call comes in very early, before mem_map is setup.
280 * It is called only for swapper_pg_dir, which already has
281 * data on it.
282 */
283 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
284}
285
286static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
287{
288 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
289}
290
291static void vmi_release_pte(unsigned long pfn)
292{
293 vmi_ops.release_page(pfn, VMI_PAGE_L1);
294}
295
296static void vmi_release_pmd(unsigned long pfn)
297{
298 vmi_ops.release_page(pfn, VMI_PAGE_L2);
299}
300
301/*
302 * We use the pgd_free hook for releasing the pgd page:
303 */
304static void vmi_pgd_free(struct mm_struct *mm, pgd_t *pgd)
305{
306 unsigned long pfn = __pa(pgd) >> PAGE_SHIFT;
307
308 vmi_ops.release_page(pfn, VMI_PAGE_L2);
309}
310
311/*
312 * Helper macros for MMU update flags. We can defer updates until a flush
313 * or page invalidation only if the update is to the current address space
314 * (otherwise, there is no flush). We must check against init_mm, since
315 * this could be a kernel update, which usually passes init_mm, although
316 * sometimes this check can be skipped if we know the particular function
317 * is only called on user mode PTEs. We could change the kernel to pass
318 * current->active_mm here, but in particular, I was unsure if changing
319 * mm/highmem.c to do this would still be correct on other architectures.
320 */
321#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm || \
322 (!mustbeuser && (mm) == &init_mm))
323#define vmi_flags_addr(mm, addr, level, user) \
324 ((level) | (is_current_as(mm, user) ? \
325 (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
326#define vmi_flags_addr_defer(mm, addr, level, user) \
327 ((level) | (is_current_as(mm, user) ? \
328 (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
329
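/*
 * Illustrative expansion (not part of the original source, shown only to
 * make the macros above concrete): for a PTE belonging to the current
 * address space, e.g. mm == current->active_mm,
 *
 *   vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0)
 *     == VMI_PAGE_PT | VMI_PAGE_CURRENT_AS | (addr & VMI_PAGE_VA_MASK)
 *
 * whereas for a foreign mm (neither current->active_mm nor init_mm) the
 * same call collapses to just VMI_PAGE_PT.  The _defer variant ORs in
 * VMI_PAGE_DEFER only when the current-AS test passes, which is what makes
 * deferring the update until the next flush safe.
 */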
330static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
331{
332 vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
333}
334
335static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
336{
337 vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
338}
339
340static void vmi_set_pte(pte_t *ptep, pte_t pte)
341{
342 /* XXX because of set_pmd_pte, this can be called on PT or PD layers */
343 vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
344}
345
346static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
347{
348 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
349}
350
351static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
352{
353#ifdef CONFIG_X86_PAE
354 const pte_t pte = { .pte = pmdval.pmd };
355#else
356 const pte_t pte = { pmdval.pud.pgd.pgd };
357#endif
358 vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
359}
360
361#ifdef CONFIG_X86_PAE
362
363static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
364{
365 /*
366 * XXX This is called from set_pmd_pte, but at both PT
367 * and PD layers so the VMI_PAGE_PT flag is wrong. But
368 * it is only called for large page mapping changes,
369 * the Xen backend doesn't support large pages, and the
370 * ESX backend doesn't depend on the flag.
371 */
372 set_64bit((unsigned long long *)ptep,pte_val(pteval));
373 vmi_ops.update_pte(ptep, VMI_PAGE_PT);
374}
375
376static void vmi_set_pud(pud_t *pudp, pud_t pudval)
377{
378 /* Um, eww */
379 const pte_t pte = { .pte = pudval.pgd.pgd };
380 vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
381}
382
383static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
384{
385 const pte_t pte = { .pte = 0 };
386 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
387}
388
389static void vmi_pmd_clear(pmd_t *pmd)
390{
391 const pte_t pte = { .pte = 0 };
392 vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
393}
394#endif
395
396#ifdef CONFIG_SMP
397static void __devinit
398vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
399 unsigned long start_esp)
400{
401 struct vmi_ap_state ap;
402
403 /* Default everything to zero. This is fine for most GPRs. */
404 memset(&ap, 0, sizeof(struct vmi_ap_state));
405
406 ap.gdtr_limit = GDT_SIZE - 1;
407 ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);
408
409 ap.idtr_limit = IDT_ENTRIES * 8 - 1;
410 ap.idtr_base = (unsigned long) idt_table;
411
412 ap.ldtr = 0;
413
414 ap.cs = __KERNEL_CS;
415 ap.eip = (unsigned long) start_eip;
416 ap.ss = __KERNEL_DS;
417 ap.esp = (unsigned long) start_esp;
418
419 ap.ds = __USER_DS;
420 ap.es = __USER_DS;
421 ap.fs = __KERNEL_PERCPU;
422 ap.gs = __KERNEL_STACK_CANARY;
423
424 ap.eflags = 0;
425
426#ifdef CONFIG_X86_PAE
427 /* efer should match BSP efer. */
428 if (cpu_has_nx) {
429 unsigned l, h;
430 rdmsr(MSR_EFER, l, h);
431 ap.efer = (unsigned long long) h << 32 | l;
432 }
433#endif
434
435 ap.cr3 = __pa(swapper_pg_dir);
436 /* Protected mode, paging, AM, WP, NE, MP. */
437 ap.cr0 = 0x80050023;
438 ap.cr4 = mmu_cr4_features;
439 vmi_ops.set_initial_ap_state((u32)&ap, phys_apicid);
440}
441#endif
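/*
 * For reference (derived from the constant above, not in the original
 * source): ap.cr0 = 0x80050023 is exactly the OR of the CR0 bits named in
 * the comment:
 *
 *   0x80000000  PG  (paging)
 *   0x00040000  AM  (alignment mask)
 *   0x00010000  WP  (write protect)
 *   0x00000020  NE  (numeric error)
 *   0x00000002  MP  (monitor coprocessor)
 *   0x00000001  PE  (protected mode enable)
 */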
442
443static void vmi_start_context_switch(struct task_struct *prev)
444{
445 paravirt_start_context_switch(prev);
446 vmi_ops.set_lazy_mode(2);
447}
448
449static void vmi_end_context_switch(struct task_struct *next)
450{
451 vmi_ops.set_lazy_mode(0);
452 paravirt_end_context_switch(next);
453}
454
455static void vmi_enter_lazy_mmu(void)
456{
457 paravirt_enter_lazy_mmu();
458 vmi_ops.set_lazy_mode(1);
459}
460
461static void vmi_leave_lazy_mmu(void)
462{
463 vmi_ops.set_lazy_mode(0);
464 paravirt_leave_lazy_mmu();
465}
466
467static inline int __init check_vmi_rom(struct vrom_header *rom)
468{
469 struct pci_header *pci;
470 struct pnp_header *pnp;
471 const char *manufacturer = "UNKNOWN";
472 const char *product = "UNKNOWN";
473 const char *license = "unspecified";
474
475 if (rom->rom_signature != 0xaa55)
476 return 0;
477 if (rom->vrom_signature != VMI_SIGNATURE)
478 return 0;
479 if (rom->api_version_maj != VMI_API_REV_MAJOR ||
480 rom->api_version_min+1 < VMI_API_REV_MINOR+1) {
481 printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n",
482 rom->api_version_maj,
483 rom->api_version_min);
484 return 0;
485 }
486
487 /*
488 * Relying on the VMI_SIGNATURE field is not 100% safe, so check
489 * the PCI header and device type to make sure this is really a
490 * VMI device.
491 */
492 if (!rom->pci_header_offs) {
493 printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n");
494 return 0;
495 }
496
497 pci = (struct pci_header *)((char *)rom+rom->pci_header_offs);
498 if (pci->vendorID != PCI_VENDOR_ID_VMWARE ||
499 pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) {
500 /* Allow it to run anyway, but warn */
501 printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n");
502 }
503
504 if (rom->pnp_header_offs) {
505 pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs);
506 if (pnp->manufacturer_offset)
507 manufacturer = (const char *)rom+pnp->manufacturer_offset;
508 if (pnp->product_offset)
509 product = (const char *)rom+pnp->product_offset;
510 }
511
512 if (rom->license_offs)
513 license = (char *)rom+rom->license_offs;
514
515 printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n",
516 manufacturer, product,
517 rom->api_version_maj, rom->api_version_min,
518 pci->rom_version_maj, pci->rom_version_min);
519
520 /* Don't allow BSD/MIT here for now because we don't want to end up
521 with any binary only shim layers */
522 if (strcmp(license, "GPL") && strcmp(license, "GPL v2")) {
523 printk(KERN_WARNING "VMI: Non GPL license `%s' found for ROM. Not used.\n",
524 license);
525 return 0;
526 }
527
528 return 1;
529}
530
531/*
532 * Probe for the VMI option ROM
533 */
534static inline int __init probe_vmi_rom(void)
535{
536 unsigned long base;
537
538 /* VMI ROM is in option ROM area, check signature */
539 for (base = 0xC0000; base < 0xE0000; base += 2048) {
540 struct vrom_header *romstart;
541 romstart = (struct vrom_header *)isa_bus_to_virt(base);
542 if (check_vmi_rom(romstart)) {
543 vmi_rom = romstart;
544 return 1;
545 }
546 }
547 return 0;
548}
549
550/*
551 * VMI setup common to all processors
552 */
553void vmi_bringup(void)
554{
555 /* We must establish the lowmem mapping for MMU ops to work */
556 if (vmi_ops.set_linear_mapping)
557 vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0);
558}
559
560/*
561 * Return a pointer to a VMI function or NULL if unimplemented
562 */
563static void *vmi_get_function(int vmicall)
564{
565 u64 reloc;
566 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
567 reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall);
568 BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);
569 if (rel->type == VMI_RELOCATION_CALL_REL)
570 return (void *)rel->eip;
571 else
572 return NULL;
573}
574
575/*
576 * Helper macro for making the VMI paravirt-ops fill code readable.
577 * For unimplemented operations, fall back to default, unless nop
578 * is returned by the ROM.
579 */
580#define para_fill(opname, vmicall) \
581do { \
582 reloc = call_vrom_long_func(vmi_rom, get_reloc, \
583 VMI_CALL_##vmicall); \
584 if (rel->type == VMI_RELOCATION_CALL_REL) \
585 opname = (void *)rel->eip; \
586 else if (rel->type == VMI_RELOCATION_NOP) \
587 opname = (void *)vmi_nop; \
588 else if (rel->type != VMI_RELOCATION_NONE) \
589 printk(KERN_WARNING "VMI: Unknown relocation " \
590 "type %d for " #vmicall"\n",\
591 rel->type); \
592} while (0)
593
594/*
595 * Helper macro for making the VMI paravirt-ops fill code readable.
596 * Used for cached operations which do not match the VMI ROM ABI and must
597 * go through a translation stub.  Ignore NOPs, since it is not clear that
598 * a NOP VMI function corresponds to a NOP paravirt-op when the
599 * functions are not in 1-1 correspondence.
600 */
601#define para_wrap(opname, wrapper, cache, vmicall) \
602do { \
603 reloc = call_vrom_long_func(vmi_rom, get_reloc, \
604 VMI_CALL_##vmicall); \
605 BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \
606 if (rel->type == VMI_RELOCATION_CALL_REL) { \
607 opname = wrapper; \
608 vmi_ops.cache = (void *)rel->eip; \
609 } \
610} while (0)
611
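/*
 * Illustrative expansion (hypothetical, only to make the two helpers above
 * easier to read): para_fill(pv_cpu_ops.clts, CLTS) behaves roughly like
 *
 *   reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_CLTS);
 *   if (rel->type == VMI_RELOCATION_CALL_REL)
 *           pv_cpu_ops.clts = (void *)rel->eip;    -- direct ROM entry point
 *   else if (rel->type == VMI_RELOCATION_NOP)
 *           pv_cpu_ops.clts = (void *)vmi_nop;     -- ROM says: do nothing
 *
 * para_wrap() differs in that it stashes the ROM entry point in vmi_ops and
 * points the paravirt-op at a Linux wrapper (vmi_set_tr, vmi_load_sp0, ...)
 * which adapts the calling convention before calling into the ROM.
 */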
612/*
613 * Activate the VMI interface and switch into paravirtualized mode
614 */
615static inline int __init activate_vmi(void)
616{
617 short kernel_cs;
618 u64 reloc;
619 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
620
621 /*
622 * Prevent page tables from being allocated in highmem, even if
623 * CONFIG_HIGHPTE is enabled.
624 */
625 __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
626
627 if (call_vrom_func(vmi_rom, vmi_init) != 0) {
628 printk(KERN_ERR "VMI ROM failed to initialize!");
629 return 0;
630 }
631 savesegment(cs, kernel_cs);
632
633 pv_info.paravirt_enabled = 1;
634 pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
635 pv_info.name = "vmi [deprecated]";
636
637 pv_init_ops.patch = vmi_patch;
638
639 /*
640 * Many of these operations are ABI compatible with VMI.
641 * This means we can fill in the paravirt-ops with direct
642 * pointers into the VMI ROM. If the calling convention for
643 * these operations changes, this code needs to be updated.
644 *
645 * Exceptions
646 * CPUID paravirt-op uses pointers, not the native ISA
647 * halt has no VMI equivalent; all VMI halts are "safe"
648 * no MSR support yet - just trap and emulate. VMI uses the
649 * same ABI as the native ISA, but Linux wants exceptions
650 * from bogus MSR read / write handled
651 * rdpmc is not yet used in Linux
652 */
653
654 /* CPUID is special, so very special it gets wrapped like a present */
655 para_wrap(pv_cpu_ops.cpuid, vmi_cpuid, cpuid, CPUID);
656
657 para_fill(pv_cpu_ops.clts, CLTS);
658 para_fill(pv_cpu_ops.get_debugreg, GetDR);
659 para_fill(pv_cpu_ops.set_debugreg, SetDR);
660 para_fill(pv_cpu_ops.read_cr0, GetCR0);
661 para_fill(pv_mmu_ops.read_cr2, GetCR2);
662 para_fill(pv_mmu_ops.read_cr3, GetCR3);
663 para_fill(pv_cpu_ops.read_cr4, GetCR4);
664 para_fill(pv_cpu_ops.write_cr0, SetCR0);
665 para_fill(pv_mmu_ops.write_cr2, SetCR2);
666 para_fill(pv_mmu_ops.write_cr3, SetCR3);
667 para_fill(pv_cpu_ops.write_cr4, SetCR4);
668
669 para_fill(pv_irq_ops.save_fl.func, GetInterruptMask);
670 para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask);
671 para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts);
672 para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts);
673
674 para_fill(pv_cpu_ops.wbinvd, WBINVD);
675 para_fill(pv_cpu_ops.read_tsc, RDTSC);
676
677 /* The following we emulate with trap and emulate for now */
678 /* paravirt_ops.read_msr = vmi_rdmsr */
679 /* paravirt_ops.write_msr = vmi_wrmsr */
680 /* paravirt_ops.rdpmc = vmi_rdpmc */
681
682 /* TR interface doesn't pass TR value, wrap */
683 para_wrap(pv_cpu_ops.load_tr_desc, vmi_set_tr, set_tr, SetTR);
684
685 /* LDT is special, too */
686 para_wrap(pv_cpu_ops.set_ldt, vmi_set_ldt, _set_ldt, SetLDT);
687
688 para_fill(pv_cpu_ops.load_gdt, SetGDT);
689 para_fill(pv_cpu_ops.load_idt, SetIDT);
690 para_fill(pv_cpu_ops.store_gdt, GetGDT);
691 para_fill(pv_cpu_ops.store_idt, GetIDT);
692 para_fill(pv_cpu_ops.store_tr, GetTR);
693 pv_cpu_ops.load_tls = vmi_load_tls;
694 para_wrap(pv_cpu_ops.write_ldt_entry, vmi_write_ldt_entry,
695 write_ldt_entry, WriteLDTEntry);
696 para_wrap(pv_cpu_ops.write_gdt_entry, vmi_write_gdt_entry,
697 write_gdt_entry, WriteGDTEntry);
698 para_wrap(pv_cpu_ops.write_idt_entry, vmi_write_idt_entry,
699 write_idt_entry, WriteIDTEntry);
700 para_wrap(pv_cpu_ops.load_sp0, vmi_load_sp0, set_kernel_stack, UpdateKernelStack);
701 para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
702 para_fill(pv_cpu_ops.io_delay, IODelay);
703
704 para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
705 set_lazy_mode, SetLazyMode);
706 para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
707 set_lazy_mode, SetLazyMode);
708
709 para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
710 set_lazy_mode, SetLazyMode);
711 para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
712 set_lazy_mode, SetLazyMode);
713
714 /* user and kernel flushes are just handled with different flags to FlushTLB */
715 para_wrap(pv_mmu_ops.flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
716 para_wrap(pv_mmu_ops.flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
717 para_fill(pv_mmu_ops.flush_tlb_single, InvalPage);
718
719 /*
720 * Until a standard flag format can be agreed on, we need to
721 * implement these as wrappers in Linux. Get the VMI ROM
722 * function pointers for the two backend calls.
723 */
724#ifdef CONFIG_X86_PAE
725 vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong);
726 vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong);
727#else
728 vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE);
729 vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE);
730#endif
731
732 if (vmi_ops.set_pte) {
733 pv_mmu_ops.set_pte = vmi_set_pte;
734 pv_mmu_ops.set_pte_at = vmi_set_pte_at;
735 pv_mmu_ops.set_pmd = vmi_set_pmd;
736#ifdef CONFIG_X86_PAE
737 pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
738 pv_mmu_ops.set_pud = vmi_set_pud;
739 pv_mmu_ops.pte_clear = vmi_pte_clear;
740 pv_mmu_ops.pmd_clear = vmi_pmd_clear;
741#endif
742 }
743
744 if (vmi_ops.update_pte) {
745 pv_mmu_ops.pte_update = vmi_update_pte;
746 pv_mmu_ops.pte_update_defer = vmi_update_pte_defer;
747 }
748
749 vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
750 if (vmi_ops.allocate_page) {
751 pv_mmu_ops.alloc_pte = vmi_allocate_pte;
752 pv_mmu_ops.alloc_pmd = vmi_allocate_pmd;
753 pv_mmu_ops.alloc_pmd_clone = vmi_allocate_pmd_clone;
754 }
755
756 vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
757 if (vmi_ops.release_page) {
758 pv_mmu_ops.release_pte = vmi_release_pte;
759 pv_mmu_ops.release_pmd = vmi_release_pmd;
760 pv_mmu_ops.pgd_free = vmi_pgd_free;
761 }
762
763 /* The set_linear_mapping hook is needed in all cases */
764 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
765
766 /*
767 * These MUST always be patched. Don't support indirect jumps
768 * through these operations, as the VMI interface may use either
769 * a jump or a call to get to these operations, depending on
770 * the backend. They are performance critical anyway, so requiring
771 * a patch is not a big problem.
772 */
773 pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0;
774 pv_cpu_ops.iret = (void *)0xbadbab0;
775
776#ifdef CONFIG_SMP
777 para_wrap(pv_apic_ops.startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
778#endif
779
780#ifdef CONFIG_X86_LOCAL_APIC
781 para_fill(apic->read, APICRead);
782 para_fill(apic->write, APICWrite);
783#endif
784
785 /*
786 * Check for VMI timer functionality by probing for a cycle frequency method
787 */
788 reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency);
789 if (!disable_vmi_timer && rel->type != VMI_RELOCATION_NONE) {
790 vmi_timer_ops.get_cycle_frequency = (void *)rel->eip;
791 vmi_timer_ops.get_cycle_counter =
792 vmi_get_function(VMI_CALL_GetCycleCounter);
793 vmi_timer_ops.get_wallclock =
794 vmi_get_function(VMI_CALL_GetWallclockTime);
795 vmi_timer_ops.wallclock_updated =
796 vmi_get_function(VMI_CALL_WallclockUpdated);
797 vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
798 vmi_timer_ops.cancel_alarm =
799 vmi_get_function(VMI_CALL_CancelAlarm);
800 x86_init.timers.timer_init = vmi_time_init;
801#ifdef CONFIG_X86_LOCAL_APIC
802 x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init;
803 x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init;
804#endif
805 pv_time_ops.sched_clock = vmi_sched_clock;
806 x86_platform.calibrate_tsc = vmi_tsc_khz;
807 x86_platform.get_wallclock = vmi_get_wallclock;
808 x86_platform.set_wallclock = vmi_set_wallclock;
809
810 /* We have true wallclock functions; disable CMOS clock sync */
811 no_sync_cmos_clock = 1;
812 } else {
813 disable_noidle = 1;
814 disable_vmi_timer = 1;
815 }
816
817 para_fill(pv_irq_ops.safe_halt, Halt);
818
819 /*
820 * Alternative instruction rewriting doesn't happen soon enough
821 * to convert VMI_IRET to a call instead of a jump; so we have
822 * to do this before IRQs get reenabled. Fortunately, it is
823 * idempotent.
824 */
825 apply_paravirt(__parainstructions, __parainstructions_end);
826
827 vmi_bringup();
828
829 return 1;
830}
831
832#undef para_fill
833
834void __init vmi_init(void)
835{
836 if (!vmi_rom)
837 probe_vmi_rom();
838 else
839 check_vmi_rom(vmi_rom);
840
841 /* In case probing for or validating the ROM failed, bail */
842 if (!vmi_rom)
843 return;
844
845 reserve_top_address(-vmi_rom->virtual_top);
846
847#ifdef CONFIG_X86_IO_APIC
848 /* This is virtual hardware; timer routing is wired correctly */
849 no_timer_check = 1;
850#endif
851}
852
853void __init vmi_activate(void)
854{
855 unsigned long flags;
856
857 if (!vmi_rom)
858 return;
859
860 local_irq_save(flags);
861 activate_vmi();
862 local_irq_restore(flags & X86_EFLAGS_IF);
863}
864
865static int __init parse_vmi(char *arg)
866{
867 if (!arg)
868 return -EINVAL;
869
870 if (!strcmp(arg, "disable_pge")) {
871 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
872 disable_pge = 1;
873 } else if (!strcmp(arg, "disable_pse")) {
874 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PSE);
875 disable_pse = 1;
876 } else if (!strcmp(arg, "disable_sep")) {
877 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
878 disable_sep = 1;
879 } else if (!strcmp(arg, "disable_tsc")) {
880 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC);
881 disable_tsc = 1;
882 } else if (!strcmp(arg, "disable_mtrr")) {
883 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_MTRR);
884 disable_mtrr = 1;
885 } else if (!strcmp(arg, "disable_timer")) {
886 disable_vmi_timer = 1;
887 disable_noidle = 1;
888 } else if (!strcmp(arg, "disable_noidle"))
889 disable_noidle = 1;
890 return 0;
891}
892
893early_param("vmi", parse_vmi);
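/*
 * Usage sketch (not in the original source): each "vmi=" instance on the
 * kernel command line carries exactly one of the keywords handled above,
 * for example
 *
 *   vmi=disable_timer       do not use the VMI paravirtual timer
 *   vmi=disable_tsc         clear X86_FEATURE_TSC on the boot CPU
 *
 * and the option may be given more than once to combine keywords.
 */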
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
deleted file mode 100644
index 5e1ff66ecd73..000000000000
--- a/arch/x86/kernel/vmiclock_32.c
+++ /dev/null
@@ -1,317 +0,0 @@
1/*
2 * VMI paravirtual timer support routines.
3 *
4 * Copyright (C) 2007, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 */
22
23#include <linux/smp.h>
24#include <linux/interrupt.h>
25#include <linux/cpumask.h>
26#include <linux/clocksource.h>
27#include <linux/clockchips.h>
28
29#include <asm/vmi.h>
30#include <asm/vmi_time.h>
31#include <asm/apicdef.h>
32#include <asm/apic.h>
33#include <asm/timer.h>
34#include <asm/i8253.h>
35#include <asm/irq_vectors.h>
36
37#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
38#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
39
40static DEFINE_PER_CPU(struct clock_event_device, local_events);
41
42static inline u32 vmi_counter(u32 flags)
43{
44 /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding
45 * cycle counter. */
46 return flags & VMI_ALARM_COUNTER_MASK;
47}
48
49/* paravirt_ops.get_wallclock = vmi_get_wallclock */
50unsigned long vmi_get_wallclock(void)
51{
52 unsigned long long wallclock;
53 wallclock = vmi_timer_ops.get_wallclock(); // nsec
54 (void)do_div(wallclock, 1000000000); // sec
55
56 return wallclock;
57}
58
59/* paravirt_ops.set_wallclock = vmi_set_wallclock */
60int vmi_set_wallclock(unsigned long now)
61{
62 return 0;
63}
64
65/* paravirt_ops.sched_clock = vmi_sched_clock */
66unsigned long long vmi_sched_clock(void)
67{
68 return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
69}
70
71/* x86_platform.calibrate_tsc = vmi_tsc_khz */
72unsigned long vmi_tsc_khz(void)
73{
74 unsigned long long khz;
75 khz = vmi_timer_ops.get_cycle_frequency();
76 (void)do_div(khz, 1000);
77 return khz;
78}
79
80static inline unsigned int vmi_get_timer_vector(void)
81{
82 return IRQ0_VECTOR;
83}
84
85/** vmi clockchip */
86#ifdef CONFIG_X86_LOCAL_APIC
87static unsigned int startup_timer_irq(unsigned int irq)
88{
89 unsigned long val = apic_read(APIC_LVTT);
90 apic_write(APIC_LVTT, vmi_get_timer_vector());
91
92 return (val & APIC_SEND_PENDING);
93}
94
95static void mask_timer_irq(unsigned int irq)
96{
97 unsigned long val = apic_read(APIC_LVTT);
98 apic_write(APIC_LVTT, val | APIC_LVT_MASKED);
99}
100
101static void unmask_timer_irq(unsigned int irq)
102{
103 unsigned long val = apic_read(APIC_LVTT);
104 apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED);
105}
106
107static void ack_timer_irq(unsigned int irq)
108{
109 ack_APIC_irq();
110}
111
112static struct irq_chip vmi_chip __read_mostly = {
113 .name = "VMI-LOCAL",
114 .startup = startup_timer_irq,
115 .mask = mask_timer_irq,
116 .unmask = unmask_timer_irq,
117 .ack = ack_timer_irq
118};
119#endif
120
121/** vmi clockevent */
122#define VMI_ALARM_WIRED_IRQ0 0x00000000
123#define VMI_ALARM_WIRED_LVTT 0x00010000
124static int vmi_wiring = VMI_ALARM_WIRED_IRQ0;
125
126static inline int vmi_get_alarm_wiring(void)
127{
128 return vmi_wiring;
129}
130
131static void vmi_timer_set_mode(enum clock_event_mode mode,
132 struct clock_event_device *evt)
133{
134 cycle_t now, cycles_per_hz;
135 BUG_ON(!irqs_disabled());
136
137 switch (mode) {
138 case CLOCK_EVT_MODE_ONESHOT:
139 case CLOCK_EVT_MODE_RESUME:
140 break;
141 case CLOCK_EVT_MODE_PERIODIC:
142 cycles_per_hz = vmi_timer_ops.get_cycle_frequency();
143 (void)do_div(cycles_per_hz, HZ);
144 now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC));
145 vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz);
146 break;
147 case CLOCK_EVT_MODE_UNUSED:
148 case CLOCK_EVT_MODE_SHUTDOWN:
149 switch (evt->mode) {
150 case CLOCK_EVT_MODE_ONESHOT:
151 vmi_timer_ops.cancel_alarm(VMI_ONESHOT);
152 break;
153 case CLOCK_EVT_MODE_PERIODIC:
154 vmi_timer_ops.cancel_alarm(VMI_PERIODIC);
155 break;
156 default:
157 break;
158 }
159 break;
160 default:
161 break;
162 }
163}
164
165static int vmi_timer_next_event(unsigned long delta,
166 struct clock_event_device *evt)
167{
168 /* Unfortunately, set_next_event interface only passes relative
169 * expiry, but we want absolute expiry.  It'd be better if we
170 * were passed an absolute expiry, since a bunch of time may
171 * have been stolen between the time the delta is computed and
172 * when we set the alarm below. */
173 cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
174
175 BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
176 vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0);
177 return 0;
178}
179
180static struct clock_event_device vmi_clockevent = {
181 .name = "vmi-timer",
182 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
183 .shift = 22,
184 .set_mode = vmi_timer_set_mode,
185 .set_next_event = vmi_timer_next_event,
186 .rating = 1000,
187 .irq = 0,
188};
189
190static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
191{
192 struct clock_event_device *evt = &__get_cpu_var(local_events);
193 evt->event_handler(evt);
194 return IRQ_HANDLED;
195}
196
197static struct irqaction vmi_clock_action = {
198 .name = "vmi-timer",
199 .handler = vmi_timer_interrupt,
200 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
201};
202
203static void __devinit vmi_time_init_clockevent(void)
204{
205 cycle_t cycles_per_msec;
206 struct clock_event_device *evt;
207
208 int cpu = smp_processor_id();
209 evt = &__get_cpu_var(local_events);
210
211 /* Use cycles_per_msec since div_sc params are 32-bits. */
212 cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
213 (void)do_div(cycles_per_msec, 1000);
214
215 memcpy(evt, &vmi_clockevent, sizeof(*evt));
216 /* Must pick .shift such that .mult fits in 32-bits. Choosing
217 * .shift to be 22 allows 2^(32-22) cycles per nanosecond
218 * before overflow. */
219 evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift);
220 /* Upper bound is clockevent's use of ulong for cycle deltas. */
221 evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
222 evt->min_delta_ns = clockevent_delta2ns(1, evt);
223 evt->cpumask = cpumask_of(cpu);
224
225 printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n",
226 evt->name, evt->mult, evt->shift);
227 clockevents_register_device(evt);
228}
229
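/*
 * Worked example (illustrative numbers, not from the original source):
 * with a 2 GHz cycle counter, cycles_per_msec == 2,000,000 and
 *
 *   evt->mult = div_sc(2000000, NSEC_PER_MSEC, 22)
 *             = (2000000 << 22) / 1000000 = 8388608
 *
 * which fits comfortably in 32 bits.  The chosen shift of 22 keeps mult
 * below 2^32 for any rate up to 2^(32-22) = 1024 cycles per nanosecond.
 */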
230void __init vmi_time_init(void)
231{
232 unsigned int cpu;
233 /* Disable PIT: BIOSes start PIT CH0 with an 18.2 Hz periodic tick. */
234 outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
235
236 vmi_time_init_clockevent();
237 setup_irq(0, &vmi_clock_action);
238 for_each_possible_cpu(cpu)
239 per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0;
240}
241
242#ifdef CONFIG_X86_LOCAL_APIC
243void __devinit vmi_time_bsp_init(void)
244{
245 /*
246 * On APIC systems, we want local timers to fire on each cpu. We do
247 * this by programming LVTT to deliver timer events to the IRQ handler
248 * for IRQ-0, since we can't re-use the APIC local timer handler
249 * without interfering with that code.
250 */
251 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
252 local_irq_disable();
253#ifdef CONFIG_SMP
254 /*
255 * XXX handle_percpu_irq only defined for SMP; we need to switch over
256 * to using it, since this is a local interrupt, which each CPU must
257 * handle individually without locking out or dropping simultaneous
258 * local timers on other CPUs. We also don't want to trigger the
259 * quirk workaround code for interrupts which gets invoked from
260 * handle_percpu_irq via eoi, so we use our own IRQ chip.
261 */
262 set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt");
263#else
264 set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt");
265#endif
266 vmi_wiring = VMI_ALARM_WIRED_LVTT;
267 apic_write(APIC_LVTT, vmi_get_timer_vector());
268 local_irq_enable();
269 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
270}
271
272void __devinit vmi_time_ap_init(void)
273{
274 vmi_time_init_clockevent();
275 apic_write(APIC_LVTT, vmi_get_timer_vector());
276}
277#endif
278
279/** vmi clocksource */
280static struct clocksource clocksource_vmi;
281
282static cycle_t read_real_cycles(struct clocksource *cs)
283{
284 cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
285 return max(ret, clocksource_vmi.cycle_last);
286}
287
288static struct clocksource clocksource_vmi = {
289 .name = "vmi-timer",
290 .rating = 450,
291 .read = read_real_cycles,
292 .mask = CLOCKSOURCE_MASK(64),
293 .mult = 0, /* to be set */
294 .shift = 22,
295 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
296};
297
298static int __init init_vmi_clocksource(void)
299{
300 cycle_t cycles_per_msec;
301
302 if (!vmi_timer_ops.get_cycle_frequency)
303 return 0;
304 /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */
305 cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
306 (void)do_div(cycles_per_msec, 1000);
307
308 /* Note that clocksource.{mult, shift} convert in the opposite direction
309 * from clockevents. */
310 clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
311 clocksource_vmi.shift);
312
313 printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec);
314 return clocksource_register(&clocksource_vmi);
315
316}
317module_init(init_vmi_clocksource);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 77d8c0f4817d..22b06f7660f4 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1056,14 +1056,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
1056 1056
1057 vcpu->arch.apic = apic; 1057 vcpu->arch.apic = apic;
1058 1058
1059 apic->regs_page = alloc_page(GFP_KERNEL); 1059 apic->regs_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
1060 if (apic->regs_page == NULL) { 1060 if (apic->regs_page == NULL) {
1061 printk(KERN_ERR "malloc apic regs error for vcpu %x\n", 1061 printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
1062 vcpu->vcpu_id); 1062 vcpu->vcpu_id);
1063 goto nomem_free_apic; 1063 goto nomem_free_apic;
1064 } 1064 }
1065 apic->regs = page_address(apic->regs_page); 1065 apic->regs = page_address(apic->regs_page);
1066 memset(apic->regs, 0, PAGE_SIZE);
1067 apic->vcpu = vcpu; 1066 apic->vcpu = vcpu;
1068 1067
1069 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, 1068 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
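The lapic.c hunk above folds the explicit page clearing into the allocation; a minimal before/after sketch (illustrative only, error handling omitted):

	/* before: allocate, then clear the APIC register page by hand */
	page = alloc_page(GFP_KERNEL);
	memset(page_address(page), 0, PAGE_SIZE);

	/* after: ask the page allocator for a pre-zeroed page */
	page = alloc_page(GFP_KERNEL | __GFP_ZERO);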
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3a09c625d526..6c2ecf0a806d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1991,13 +1991,14 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1991 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 1991 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
1992 0 /* Reserved, DCA */ | F(XMM4_1) | 1992 0 /* Reserved, DCA */ | F(XMM4_1) |
1993 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 1993 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
1994 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX); 1994 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
1995 F(F16C);
1995 /* cpuid 0x80000001.ecx */ 1996 /* cpuid 0x80000001.ecx */
1996 const u32 kvm_supported_word6_x86_features = 1997 const u32 kvm_supported_word6_x86_features =
1997 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | 1998 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
1998 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | 1999 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
1999 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | 2000 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
2000 0 /* SKINIT */ | 0 /* WDT */; 2001 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
2001 2002
2002 /* all calls to cpuid_count() should be made on the same cpu */ 2003 /* all calls to cpuid_count() should be made on the same cpu */
2003 get_cpu(); 2004 get_cpu();
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 9d5f55848455..73b1e1a1f489 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -791,22 +791,22 @@ static void lguest_flush_tlb_kernel(void)
791 * simple as setting a bit. We don't actually "ack" interrupts as such, we 791 * simple as setting a bit. We don't actually "ack" interrupts as such, we
792 * just mask and unmask them. I wonder if we should be cleverer? 792 * just mask and unmask them. I wonder if we should be cleverer?
793 */ 793 */
794static void disable_lguest_irq(unsigned int irq) 794static void disable_lguest_irq(struct irq_data *data)
795{ 795{
796 set_bit(irq, lguest_data.blocked_interrupts); 796 set_bit(data->irq, lguest_data.blocked_interrupts);
797} 797}
798 798
799static void enable_lguest_irq(unsigned int irq) 799static void enable_lguest_irq(struct irq_data *data)
800{ 800{
801 clear_bit(irq, lguest_data.blocked_interrupts); 801 clear_bit(data->irq, lguest_data.blocked_interrupts);
802} 802}
803 803
804/* This structure describes the lguest IRQ controller. */ 804/* This structure describes the lguest IRQ controller. */
805static struct irq_chip lguest_irq_controller = { 805static struct irq_chip lguest_irq_controller = {
806 .name = "lguest", 806 .name = "lguest",
807 .mask = disable_lguest_irq, 807 .irq_mask = disable_lguest_irq,
808 .mask_ack = disable_lguest_irq, 808 .irq_mask_ack = disable_lguest_irq,
809 .unmask = enable_lguest_irq, 809 .irq_unmask = enable_lguest_irq,
810}; 810};
811 811
812/* 812/*
@@ -838,12 +838,12 @@ static void __init lguest_init_IRQ(void)
838 * rather than set them in lguest_init_IRQ we are called here every time an 838 * rather than set them in lguest_init_IRQ we are called here every time an
839 * lguest device needs an interrupt. 839 * lguest device needs an interrupt.
840 * 840 *
841 * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should 841 * FIXME: irq_alloc_desc_at() can fail due to lack of memory, we should
842 * pass that up! 842 * pass that up!
843 */ 843 */
844void lguest_setup_irq(unsigned int irq) 844void lguest_setup_irq(unsigned int irq)
845{ 845{
846 irq_to_desc_alloc_node(irq, 0); 846 irq_alloc_desc_at(irq, 0);
847 set_irq_chip_and_handler_name(irq, &lguest_irq_controller, 847 set_irq_chip_and_handler_name(irq, &lguest_irq_controller,
848 handle_level_irq, "level"); 848 handle_level_irq, "level");
849} 849}
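The lguest hunk above follows the genirq conversion from "unsigned int irq" callbacks to "struct irq_data *" callbacks; a minimal sketch of the new-style chip (hypothetical names, mirroring the pattern shown in the diff):

	/* example_blocked_irqs is a hypothetical unsigned long bitmap */
	static void example_irq_mask(struct irq_data *data)
	{
		/* the IRQ number now arrives via irq_data */
		set_bit(data->irq, example_blocked_irqs);
	}

	static struct irq_chip example_chip = {
		.name     = "example",
		.irq_mask = example_irq_mask,
	};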
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
index 5415a9d06f53..b908a59eccf5 100644
--- a/arch/x86/lib/memcpy_32.c
+++ b/arch/x86/lib/memcpy_32.c
@@ -22,22 +22,187 @@ EXPORT_SYMBOL(memset);
22 22
23void *memmove(void *dest, const void *src, size_t n) 23void *memmove(void *dest, const void *src, size_t n)
24{ 24{
25 int d0, d1, d2; 25 int d0,d1,d2,d3,d4,d5;
26 26 char *ret = dest;
27 if (dest < src) { 27
28 memcpy(dest, src, n); 28 __asm__ __volatile__(
29 } else { 29 /* Handle more 16bytes in loop */
30 __asm__ __volatile__( 30 "cmp $0x10, %0\n\t"
31 "std\n\t" 31 "jb 1f\n\t"
32 "rep\n\t" 32
33 "movsb\n\t" 33 /* Decide forward/backward copy mode */
34 "cld" 34 "cmp %2, %1\n\t"
35 : "=&c" (d0), "=&S" (d1), "=&D" (d2) 35 "jb 2f\n\t"
36 :"0" (n), 36
37 "1" (n-1+src), 37 /*
38 "2" (n-1+dest) 38 * movs instructions have a high startup latency,
39 :"memory"); 39 * so we handle small sizes with general registers.
40 } 40 */
41 return dest; 41 "cmp $680, %0\n\t"
42 "jb 3f\n\t"
43 /*
44 * The movs instruction is only good for the aligned case.
45 */
46 "mov %1, %3\n\t"
47 "xor %2, %3\n\t"
48 "and $0xff, %3\n\t"
49 "jz 4f\n\t"
50 "3:\n\t"
51 "sub $0x10, %0\n\t"
52
53 /*
54 * We gobble 16 bytes forward in each loop.
55 */
56 "3:\n\t"
57 "sub $0x10, %0\n\t"
58 "mov 0*4(%1), %3\n\t"
59 "mov 1*4(%1), %4\n\t"
60 "mov %3, 0*4(%2)\n\t"
61 "mov %4, 1*4(%2)\n\t"
62 "mov 2*4(%1), %3\n\t"
63 "mov 3*4(%1), %4\n\t"
64 "mov %3, 2*4(%2)\n\t"
65 "mov %4, 3*4(%2)\n\t"
66 "lea 0x10(%1), %1\n\t"
67 "lea 0x10(%2), %2\n\t"
68 "jae 3b\n\t"
69 "add $0x10, %0\n\t"
70 "jmp 1f\n\t"
71
72 /*
73 * Handle data forward by movs.
74 */
75 ".p2align 4\n\t"
76 "4:\n\t"
77 "mov -4(%1, %0), %3\n\t"
78 "lea -4(%2, %0), %4\n\t"
79 "shr $2, %0\n\t"
80 "rep movsl\n\t"
81 "mov %3, (%4)\n\t"
82 "jmp 11f\n\t"
83 /*
84 * Handle data backward by movs.
85 */
86 ".p2align 4\n\t"
87 "6:\n\t"
88 "mov (%1), %3\n\t"
89 "mov %2, %4\n\t"
90 "lea -4(%1, %0), %1\n\t"
91 "lea -4(%2, %0), %2\n\t"
92 "shr $2, %0\n\t"
93 "std\n\t"
94 "rep movsl\n\t"
95 "mov %3,(%4)\n\t"
96 "cld\n\t"
97 "jmp 11f\n\t"
98
99 /*
100 * Start to prepare for backward copy.
101 */
102 ".p2align 4\n\t"
103 "2:\n\t"
104 "cmp $680, %0\n\t"
105 "jb 5f\n\t"
106 "mov %1, %3\n\t"
107 "xor %2, %3\n\t"
108 "and $0xff, %3\n\t"
109 "jz 6b\n\t"
110
111 /*
112 * Calculate copy position to tail.
113 */
114 "5:\n\t"
115 "add %0, %1\n\t"
116 "add %0, %2\n\t"
117 "sub $0x10, %0\n\t"
118
119 /*
120 * We gobble 16 bytes backward in each loop.
121 */
122 "7:\n\t"
123 "sub $0x10, %0\n\t"
124
125 "mov -1*4(%1), %3\n\t"
126 "mov -2*4(%1), %4\n\t"
127 "mov %3, -1*4(%2)\n\t"
128 "mov %4, -2*4(%2)\n\t"
129 "mov -3*4(%1), %3\n\t"
130 "mov -4*4(%1), %4\n\t"
131 "mov %3, -3*4(%2)\n\t"
132 "mov %4, -4*4(%2)\n\t"
133 "lea -0x10(%1), %1\n\t"
134 "lea -0x10(%2), %2\n\t"
135 "jae 7b\n\t"
136 /*
137 * Calculate copy position to head.
138 */
139 "add $0x10, %0\n\t"
140 "sub %0, %1\n\t"
141 "sub %0, %2\n\t"
142
143 /*
144 * Move data from 8 bytes to 15 bytes.
145 */
146 ".p2align 4\n\t"
147 "1:\n\t"
148 "cmp $8, %0\n\t"
149 "jb 8f\n\t"
150 "mov 0*4(%1), %3\n\t"
151 "mov 1*4(%1), %4\n\t"
152 "mov -2*4(%1, %0), %5\n\t"
153 "mov -1*4(%1, %0), %1\n\t"
154
155 "mov %3, 0*4(%2)\n\t"
156 "mov %4, 1*4(%2)\n\t"
157 "mov %5, -2*4(%2, %0)\n\t"
158 "mov %1, -1*4(%2, %0)\n\t"
159 "jmp 11f\n\t"
160
161 /*
162 * Move data from 4 bytes to 7 bytes.
163 */
164 ".p2align 4\n\t"
165 "8:\n\t"
166 "cmp $4, %0\n\t"
167 "jb 9f\n\t"
168 "mov 0*4(%1), %3\n\t"
169 "mov -1*4(%1, %0), %4\n\t"
170 "mov %3, 0*4(%2)\n\t"
171 "mov %4, -1*4(%2, %0)\n\t"
172 "jmp 11f\n\t"
173
174 /*
175 * Move data from 2 bytes to 3 bytes.
176 */
177 ".p2align 4\n\t"
178 "9:\n\t"
179 "cmp $2, %0\n\t"
180 "jb 10f\n\t"
181 "movw 0*2(%1), %%dx\n\t"
182 "movw -1*2(%1, %0), %%bx\n\t"
183 "movw %%dx, 0*2(%2)\n\t"
184 "movw %%bx, -1*2(%2, %0)\n\t"
185 "jmp 11f\n\t"
186
187 /*
188 * Move data for 1 byte.
189 */
190 ".p2align 4\n\t"
191 "10:\n\t"
192 "cmp $1, %0\n\t"
193 "jb 11f\n\t"
194 "movb (%1), %%cl\n\t"
195 "movb %%cl, (%2)\n\t"
196 ".p2align 4\n\t"
197 "11:"
198 : "=&c" (d0), "=&S" (d1), "=&D" (d2),
199 "=r" (d3),"=r" (d4), "=r"(d5)
200 :"0" (n),
201 "1" (src),
202 "2" (dest)
203 :"memory");
204
205 return ret;
206
42} 207}
43EXPORT_SYMBOL(memmove); 208EXPORT_SYMBOL(memmove);
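The rewritten 32-bit memmove above picks a copy direction from the relative position of source and destination, moves 16 bytes per loop through general registers, and only uses rep movsl for large (>= 680 byte) transfers whose source and destination are similarly aligned. A compact C sketch of the same overlap-safe idea (illustrative only, not the optimized kernel routine):

	void *memmove_sketch(void *dest, const void *src, size_t n)
	{
		unsigned char *d = dest;
		const unsigned char *s = src;

		if (d < s) {
			while (n--)		/* forward copy cannot clobber */
				*d++ = *s++;	/* bytes not yet read */
		} else {
			d += n;			/* backward copy protects the */
			s += n;			/* overlapping tail */
			while (n--)
				*--d = *--s;
		}
		return dest;
	}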
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index bcbcd1e0f7d5..75ef61e35e38 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -40,84 +40,132 @@
40ENTRY(__memcpy) 40ENTRY(__memcpy)
41ENTRY(memcpy) 41ENTRY(memcpy)
42 CFI_STARTPROC 42 CFI_STARTPROC
43 movq %rdi, %rax
43 44
44 /* 45 /*
45 * Put the number of full 64-byte blocks into %ecx. 46 * Use 32bit CMP here to avoid long NOP padding.
46 * Tail portion is handled at the end:
47 */ 47 */
48 movq %rdi, %rax 48 cmp $0x20, %edx
49 movl %edx, %ecx 49 jb .Lhandle_tail
50 shrl $6, %ecx
51 jz .Lhandle_tail
52 50
53 .p2align 4
54.Lloop_64:
55 /* 51 /*
56 * We decrement the loop index here - and the zero-flag is 52 * We check whether a memory false dependence could occur,
57 * checked at the end of the loop (instructions inbetween do 53 * then jump to the corresponding copy mode.
58 * not change the zero flag):
59 */ 54 */
60 decl %ecx 55 cmp %dil, %sil
56 jl .Lcopy_backward
57 subl $0x20, %edx
58.Lcopy_forward_loop:
59 subq $0x20, %rdx
61 60
62 /* 61 /*
63 * Move in blocks of 4x16 bytes: 62 * Move in blocks of 4x8 bytes:
64 */ 63 */
65 movq 0*8(%rsi), %r11 64 movq 0*8(%rsi), %r8
66 movq 1*8(%rsi), %r8 65 movq 1*8(%rsi), %r9
67 movq %r11, 0*8(%rdi) 66 movq 2*8(%rsi), %r10
68 movq %r8, 1*8(%rdi) 67 movq 3*8(%rsi), %r11
69 68 leaq 4*8(%rsi), %rsi
70 movq 2*8(%rsi), %r9 69
71 movq 3*8(%rsi), %r10 70 movq %r8, 0*8(%rdi)
72 movq %r9, 2*8(%rdi) 71 movq %r9, 1*8(%rdi)
73 movq %r10, 3*8(%rdi) 72 movq %r10, 2*8(%rdi)
74 73 movq %r11, 3*8(%rdi)
75 movq 4*8(%rsi), %r11 74 leaq 4*8(%rdi), %rdi
76 movq 5*8(%rsi), %r8 75 jae .Lcopy_forward_loop
77 movq %r11, 4*8(%rdi) 76 addq $0x20, %rdx
78 movq %r8, 5*8(%rdi) 77 jmp .Lhandle_tail
79 78
80 movq 6*8(%rsi), %r9 79.Lcopy_backward:
81 movq 7*8(%rsi), %r10 80 /*
82 movq %r9, 6*8(%rdi) 81 * Calculate copy position to tail.
83 movq %r10, 7*8(%rdi) 82 */
84 83 addq %rdx, %rsi
85 leaq 64(%rsi), %rsi 84 addq %rdx, %rdi
86 leaq 64(%rdi), %rdi 85 subq $0x20, %rdx
87 86 /*
88 jnz .Lloop_64 87 * At most 3 ALU operations in one cycle,
88 * so append NOPs in the same 16-byte chunk.
89 */
90 .p2align 4
91.Lcopy_backward_loop:
92 subq $0x20, %rdx
93 movq -1*8(%rsi), %r8
94 movq -2*8(%rsi), %r9
95 movq -3*8(%rsi), %r10
96 movq -4*8(%rsi), %r11
97 leaq -4*8(%rsi), %rsi
98 movq %r8, -1*8(%rdi)
99 movq %r9, -2*8(%rdi)
100 movq %r10, -3*8(%rdi)
101 movq %r11, -4*8(%rdi)
102 leaq -4*8(%rdi), %rdi
103 jae .Lcopy_backward_loop
89 104
105 /*
106 * Calculate copy position to head.
107 */
108 addq $0x20, %rdx
109 subq %rdx, %rsi
110 subq %rdx, %rdi
90.Lhandle_tail: 111.Lhandle_tail:
91 movl %edx, %ecx 112 cmpq $16, %rdx
92 andl $63, %ecx 113 jb .Lless_16bytes
93 shrl $3, %ecx
94 jz .Lhandle_7
95 114
115 /*
116 * Move data from 16 bytes to 31 bytes.
117 */
118 movq 0*8(%rsi), %r8
119 movq 1*8(%rsi), %r9
120 movq -2*8(%rsi, %rdx), %r10
121 movq -1*8(%rsi, %rdx), %r11
122 movq %r8, 0*8(%rdi)
123 movq %r9, 1*8(%rdi)
124 movq %r10, -2*8(%rdi, %rdx)
125 movq %r11, -1*8(%rdi, %rdx)
126 retq
96 .p2align 4 127 .p2align 4
97.Lloop_8: 128.Lless_16bytes:
98 decl %ecx 129 cmpq $8, %rdx
99 movq (%rsi), %r8 130 jb .Lless_8bytes
100 movq %r8, (%rdi) 131 /*
101 leaq 8(%rdi), %rdi 132 * Move data from 8 bytes to 15 bytes.
102 leaq 8(%rsi), %rsi 133 */
103 jnz .Lloop_8 134 movq 0*8(%rsi), %r8
104 135 movq -1*8(%rsi, %rdx), %r9
105.Lhandle_7: 136 movq %r8, 0*8(%rdi)
106 movl %edx, %ecx 137 movq %r9, -1*8(%rdi, %rdx)
107 andl $7, %ecx 138 retq
108 jz .Lend 139 .p2align 4
140.Lless_8bytes:
141 cmpq $4, %rdx
142 jb .Lless_3bytes
109 143
144 /*
145 * Move data from 4 bytes to 7 bytes.
146 */
147 movl (%rsi), %ecx
148 movl -4(%rsi, %rdx), %r8d
149 movl %ecx, (%rdi)
150 movl %r8d, -4(%rdi, %rdx)
151 retq
110 .p2align 4 152 .p2align 4
153.Lless_3bytes:
154 cmpl $0, %edx
155 je .Lend
156 /*
157 * Move data from 1 byte to 3 bytes.
158 */
111.Lloop_1: 159.Lloop_1:
112 movb (%rsi), %r8b 160 movb (%rsi), %r8b
113 movb %r8b, (%rdi) 161 movb %r8b, (%rdi)
114 incq %rdi 162 incq %rdi
115 incq %rsi 163 incq %rsi
116 decl %ecx 164 decl %edx
117 jnz .Lloop_1 165 jnz .Lloop_1
118 166
119.Lend: 167.Lend:
120 ret 168 retq
121 CFI_ENDPROC 169 CFI_ENDPROC
122ENDPROC(memcpy) 170ENDPROC(memcpy)
123ENDPROC(__memcpy) 171ENDPROC(__memcpy)
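The tail handling in the new 64-bit memcpy above avoids byte loops for most sizes by issuing two possibly-overlapping loads and stores that together cover the whole remainder. An equivalent C fragment for the 8..15 byte case (illustrative only; u64 is the kernel's 64-bit type):

	/* assumes 8 <= n <= 15 and non-overlapping src/dest (memcpy semantics) */
	u64 head, tail;

	memcpy(&head, src, 8);
	memcpy(&tail, (const char *)src + n - 8, 8);	/* overlaps head for n < 16 */
	memcpy(dest, &head, 8);
	memcpy((char *)dest + n - 8, &tail, 8);		/* rewriting shared bytes is harmless */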
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
index 0a33909bf122..6d0f0ec41b34 100644
--- a/arch/x86/lib/memmove_64.c
+++ b/arch/x86/lib/memmove_64.c
@@ -8,14 +8,185 @@
8#undef memmove 8#undef memmove
9void *memmove(void *dest, const void *src, size_t count) 9void *memmove(void *dest, const void *src, size_t count)
10{ 10{
11 if (dest < src) { 11 unsigned long d0,d1,d2,d3,d4,d5,d6,d7;
12 return memcpy(dest, src, count); 12 char *ret;
13 } else { 13
14 char *p = dest + count; 14 __asm__ __volatile__(
15 const char *s = src + count; 15 /* Handle more 32bytes in loop */
16 while (count--) 16 "mov %2, %3\n\t"
17 *--p = *--s; 17 "cmp $0x20, %0\n\t"
18 } 18 "jb 1f\n\t"
19 return dest; 19
20 /* Decide forward/backward copy mode */
21 "cmp %2, %1\n\t"
22 "jb 2f\n\t"
23
24 /*
25 * movsq instructions have a high startup latency,
26 * so we handle small sizes with general registers.
27 */
28 "cmp $680, %0\n\t"
29 "jb 3f\n\t"
30 /*
31 * The movsq instruction is only good for the aligned case.
32 */
33 "cmpb %%dil, %%sil\n\t"
34 "je 4f\n\t"
35 "3:\n\t"
36 "sub $0x20, %0\n\t"
37 /*
38 * We gobble 32 bytes forward in each loop.
39 */
40 "5:\n\t"
41 "sub $0x20, %0\n\t"
42 "movq 0*8(%1), %4\n\t"
43 "movq 1*8(%1), %5\n\t"
44 "movq 2*8(%1), %6\n\t"
45 "movq 3*8(%1), %7\n\t"
46 "leaq 4*8(%1), %1\n\t"
47
48 "movq %4, 0*8(%2)\n\t"
49 "movq %5, 1*8(%2)\n\t"
50 "movq %6, 2*8(%2)\n\t"
51 "movq %7, 3*8(%2)\n\t"
52 "leaq 4*8(%2), %2\n\t"
53 "jae 5b\n\t"
54 "addq $0x20, %0\n\t"
55 "jmp 1f\n\t"
56 /*
57 * Handle data forward by movsq.
58 */
59 ".p2align 4\n\t"
60 "4:\n\t"
61 "movq %0, %8\n\t"
62 "movq -8(%1, %0), %4\n\t"
63 "lea -8(%2, %0), %5\n\t"
64 "shrq $3, %8\n\t"
65 "rep movsq\n\t"
66 "movq %4, (%5)\n\t"
67 "jmp 13f\n\t"
68 /*
69 * Handle data backward by movsq.
70 */
71 ".p2align 4\n\t"
72 "7:\n\t"
73 "movq %0, %8\n\t"
74 "movq (%1), %4\n\t"
75 "movq %2, %5\n\t"
76 "leaq -8(%1, %0), %1\n\t"
77 "leaq -8(%2, %0), %2\n\t"
78 "shrq $3, %8\n\t"
79 "std\n\t"
80 "rep movsq\n\t"
81 "cld\n\t"
82 "movq %4, (%5)\n\t"
83 "jmp 13f\n\t"
84
85 /*
86 * Start to prepare for backward copy.
87 */
88 ".p2align 4\n\t"
89 "2:\n\t"
90 "cmp $680, %0\n\t"
91 "jb 6f \n\t"
92 "cmp %%dil, %%sil\n\t"
93 "je 7b \n\t"
94 "6:\n\t"
95 /*
96 * Calculate copy position to tail.
97 */
98 "addq %0, %1\n\t"
99 "addq %0, %2\n\t"
100 "subq $0x20, %0\n\t"
101 /*
102 * We gobble 32 bytes backward in each loop.
103 */
104 "8:\n\t"
105 "subq $0x20, %0\n\t"
106 "movq -1*8(%1), %4\n\t"
107 "movq -2*8(%1), %5\n\t"
108 "movq -3*8(%1), %6\n\t"
109 "movq -4*8(%1), %7\n\t"
110 "leaq -4*8(%1), %1\n\t"
111
112 "movq %4, -1*8(%2)\n\t"
113 "movq %5, -2*8(%2)\n\t"
114 "movq %6, -3*8(%2)\n\t"
115 "movq %7, -4*8(%2)\n\t"
116 "leaq -4*8(%2), %2\n\t"
117 "jae 8b\n\t"
118 /*
119 * Calculate copy position to head.
120 */
121 "addq $0x20, %0\n\t"
122 "subq %0, %1\n\t"
123 "subq %0, %2\n\t"
124 "1:\n\t"
125 "cmpq $16, %0\n\t"
126 "jb 9f\n\t"
127 /*
128 * Move data from 16 bytes to 31 bytes.
129 */
130 "movq 0*8(%1), %4\n\t"
131 "movq 1*8(%1), %5\n\t"
132 "movq -2*8(%1, %0), %6\n\t"
133 "movq -1*8(%1, %0), %7\n\t"
134 "movq %4, 0*8(%2)\n\t"
135 "movq %5, 1*8(%2)\n\t"
136 "movq %6, -2*8(%2, %0)\n\t"
137 "movq %7, -1*8(%2, %0)\n\t"
138 "jmp 13f\n\t"
139 ".p2align 4\n\t"
140 "9:\n\t"
141 "cmpq $8, %0\n\t"
142 "jb 10f\n\t"
143 /*
144 * Move data from 8 bytes to 15 bytes.
145 */
146 "movq 0*8(%1), %4\n\t"
147 "movq -1*8(%1, %0), %5\n\t"
148 "movq %4, 0*8(%2)\n\t"
149 "movq %5, -1*8(%2, %0)\n\t"
150 "jmp 13f\n\t"
151 "10:\n\t"
152 "cmpq $4, %0\n\t"
153 "jb 11f\n\t"
154 /*
155 * Move data from 4 bytes to 7 bytes.
156 */
157 "movl (%1), %4d\n\t"
158 "movl -4(%1, %0), %5d\n\t"
159 "movl %4d, (%2)\n\t"
160 "movl %5d, -4(%2, %0)\n\t"
161 "jmp 13f\n\t"
162 "11:\n\t"
163 "cmp $2, %0\n\t"
164 "jb 12f\n\t"
165 /*
166 * Move data from 2 bytes to 3 bytes.
167 */
168 "movw (%1), %4w\n\t"
169 "movw -2(%1, %0), %5w\n\t"
170 "movw %4w, (%2)\n\t"
171 "movw %5w, -2(%2, %0)\n\t"
172 "jmp 13f\n\t"
173 "12:\n\t"
174 "cmp $1, %0\n\t"
175 "jb 13f\n\t"
176 /*
177 * Move data for 1 byte.
178 */
179 "movb (%1), %4b\n\t"
180 "movb %4b, (%2)\n\t"
181 "13:\n\t"
182 : "=&d" (d0), "=&S" (d1), "=&D" (d2), "=&a" (ret) ,
183 "=r"(d3), "=r"(d4), "=r"(d5), "=r"(d6), "=&c" (d7)
184 :"0" (count),
185 "1" (src),
186 "2" (dest)
187 :"memory");
188
189 return ret;
190
20} 191}
21EXPORT_SYMBOL(memmove); 192EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a24c6cfdccc4..79b0b372d2d0 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -229,7 +229,16 @@ void vmalloc_sync_all(void)
229 229
230 spin_lock_irqsave(&pgd_lock, flags); 230 spin_lock_irqsave(&pgd_lock, flags);
231 list_for_each_entry(page, &pgd_list, lru) { 231 list_for_each_entry(page, &pgd_list, lru) {
232 if (!vmalloc_sync_one(page_address(page), address)) 232 spinlock_t *pgt_lock;
233 pmd_t *ret;
234
235 pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
236
237 spin_lock(pgt_lock);
238 ret = vmalloc_sync_one(page_address(page), address);
239 spin_unlock(pgt_lock);
240
241 if (!ret)
233 break; 242 break;
234 } 243 }
235 spin_unlock_irqrestore(&pgd_lock, flags); 244 spin_unlock_irqrestore(&pgd_lock, flags);
@@ -328,29 +337,7 @@ out:
328 337
329void vmalloc_sync_all(void) 338void vmalloc_sync_all(void)
330{ 339{
331 unsigned long address; 340 sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
332
333 for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
334 address += PGDIR_SIZE) {
335
336 const pgd_t *pgd_ref = pgd_offset_k(address);
337 unsigned long flags;
338 struct page *page;
339
340 if (pgd_none(*pgd_ref))
341 continue;
342
343 spin_lock_irqsave(&pgd_lock, flags);
344 list_for_each_entry(page, &pgd_list, lru) {
345 pgd_t *pgd;
346 pgd = (pgd_t *)page_address(page) + pgd_index(address);
347 if (pgd_none(*pgd))
348 set_pgd(pgd, *pgd_ref);
349 else
350 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
351 }
352 spin_unlock_irqrestore(&pgd_lock, flags);
353 }
354} 341}
355 342
356/* 343/*
@@ -898,8 +885,14 @@ spurious_fault(unsigned long error_code, unsigned long address)
898 if (pmd_large(*pmd)) 885 if (pmd_large(*pmd))
899 return spurious_fault_check(error_code, (pte_t *) pmd); 886 return spurious_fault_check(error_code, (pte_t *) pmd);
900 887
888 /*
889 * Note: don't use pte_present() here, since it returns true
890 * if the _PAGE_PROTNONE bit is set. However, this aliases the
891 * _PAGE_GLOBAL bit, which for kernel pages gives false positives
892 * when CONFIG_DEBUG_PAGEALLOC is used.
893 */
901 pte = pte_offset_kernel(pmd, address); 894 pte = pte_offset_kernel(pmd, address);
902 if (!pte_present(*pte)) 895 if (!(pte_flags(*pte) & _PAGE_PRESENT))
903 return 0; 896 return 0;
904 897
905 ret = spurious_fault_check(error_code, pte); 898 ret = spurious_fault_check(error_code, pte);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index bca79091b9d6..558f2d332076 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -67,7 +67,7 @@ static __init void *alloc_low_page(void)
67 panic("alloc_low_page: ran out of memory"); 67 panic("alloc_low_page: ran out of memory");
68 68
69 adr = __va(pfn * PAGE_SIZE); 69 adr = __va(pfn * PAGE_SIZE);
70 memset(adr, 0, PAGE_SIZE); 70 clear_page(adr);
71 return adr; 71 return adr;
72} 72}
73 73
@@ -558,7 +558,7 @@ char swsusp_pg_dir[PAGE_SIZE]
558 558
559static inline void save_pg_dir(void) 559static inline void save_pg_dir(void)
560{ 560{
561 memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE); 561 copy_page(swsusp_pg_dir, swapper_pg_dir);
562} 562}
563#else /* !CONFIG_ACPI_SLEEP */ 563#else /* !CONFIG_ACPI_SLEEP */
564static inline void save_pg_dir(void) 564static inline void save_pg_dir(void)
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 9a6674689a20..c55f900fbf89 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -98,6 +98,43 @@ static int __init nonx32_setup(char *str)
98__setup("noexec32=", nonx32_setup); 98__setup("noexec32=", nonx32_setup);
99 99
100/* 100/*
101 * When memory is added or removed, make sure all process MMs have
102 * suitable PGD entries in the local PGD-level page.
103 */
104void sync_global_pgds(unsigned long start, unsigned long end)
105{
106 unsigned long address;
107
108 for (address = start; address <= end; address += PGDIR_SIZE) {
109 const pgd_t *pgd_ref = pgd_offset_k(address);
110 unsigned long flags;
111 struct page *page;
112
113 if (pgd_none(*pgd_ref))
114 continue;
115
116 spin_lock_irqsave(&pgd_lock, flags);
117 list_for_each_entry(page, &pgd_list, lru) {
118 pgd_t *pgd;
119 spinlock_t *pgt_lock;
120
121 pgd = (pgd_t *)page_address(page) + pgd_index(address);
122 pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
123 spin_lock(pgt_lock);
124
125 if (pgd_none(*pgd))
126 set_pgd(pgd, *pgd_ref);
127 else
128 BUG_ON(pgd_page_vaddr(*pgd)
129 != pgd_page_vaddr(*pgd_ref));
130
131 spin_unlock(pgt_lock);
132 }
133 spin_unlock_irqrestore(&pgd_lock, flags);
134 }
135}
136
137/*
101 * NOTE: This function is marked __ref because it calls __init function 138 * NOTE: This function is marked __ref because it calls __init function
102 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. 139 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
103 */ 140 */
@@ -293,7 +330,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
293 panic("alloc_low_page: ran out of memory"); 330 panic("alloc_low_page: ran out of memory");
294 331
295 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); 332 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
296 memset(adr, 0, PAGE_SIZE); 333 clear_page(adr);
297 *phys = pfn * PAGE_SIZE; 334 *phys = pfn * PAGE_SIZE;
298 return adr; 335 return adr;
299} 336}
@@ -534,11 +571,13 @@ kernel_physical_mapping_init(unsigned long start,
534 unsigned long end, 571 unsigned long end,
535 unsigned long page_size_mask) 572 unsigned long page_size_mask)
536{ 573{
537 574 bool pgd_changed = false;
538 unsigned long next, last_map_addr = end; 575 unsigned long next, last_map_addr = end;
576 unsigned long addr;
539 577
540 start = (unsigned long)__va(start); 578 start = (unsigned long)__va(start);
541 end = (unsigned long)__va(end); 579 end = (unsigned long)__va(end);
580 addr = start;
542 581
543 for (; start < end; start = next) { 582 for (; start < end; start = next) {
544 pgd_t *pgd = pgd_offset_k(start); 583 pgd_t *pgd = pgd_offset_k(start);
@@ -563,7 +602,12 @@ kernel_physical_mapping_init(unsigned long start,
563 spin_lock(&init_mm.page_table_lock); 602 spin_lock(&init_mm.page_table_lock);
564 pgd_populate(&init_mm, pgd, __va(pud_phys)); 603 pgd_populate(&init_mm, pgd, __va(pud_phys));
565 spin_unlock(&init_mm.page_table_lock); 604 spin_unlock(&init_mm.page_table_lock);
605 pgd_changed = true;
566 } 606 }
607
608 if (pgd_changed)
609 sync_global_pgds(addr, end);
610
567 __flush_tlb_all(); 611 __flush_tlb_all();
568 612
569 return last_map_addr; 613 return last_map_addr;
@@ -1003,6 +1047,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
1003 } 1047 }
1004 1048
1005 } 1049 }
1050 sync_global_pgds((unsigned long)start_page, end);
1006 return 0; 1051 return 0;
1007} 1052}
1008 1053
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 970ed579d4e4..52d54bfc1ebb 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -22,7 +22,7 @@
22#include <asm/numa.h> 22#include <asm/numa.h>
23#include <asm/mpspec.h> 23#include <asm/mpspec.h>
24#include <asm/apic.h> 24#include <asm/apic.h>
25#include <asm/k8.h> 25#include <asm/amd_nb.h>
26 26
27static struct bootnode __initdata nodes[8]; 27static struct bootnode __initdata nodes[8];
28static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE; 28static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
@@ -54,8 +54,8 @@ static __init int find_northbridge(void)
54static __init void early_get_boot_cpu_id(void) 54static __init void early_get_boot_cpu_id(void)
55{ 55{
56 /* 56 /*
57 * need to get boot_cpu_id so can use that to create apicid_to_node 57 * need to get the APIC ID of the BSP so we can use it to
58 * in k8_scan_nodes() 58 * create apicid_to_node in k8_scan_nodes()
59 */ 59 */
60#ifdef CONFIG_X86_MPPARSE 60#ifdef CONFIG_X86_MPPARSE
61 /* 61 /*
@@ -212,7 +212,7 @@ int __init k8_scan_nodes(void)
212 bits = boot_cpu_data.x86_coreid_bits; 212 bits = boot_cpu_data.x86_coreid_bits;
213 cores = (1<<bits); 213 cores = (1<<bits);
214 apicid_base = 0; 214 apicid_base = 0;
215 /* need to get boot_cpu_id early for system with apicid lifting */ 215 /* get the APIC ID of the BSP early for systems with apicid lifting */
216 early_get_boot_cpu_id(); 216 early_get_boot_cpu_id();
217 if (boot_cpu_physical_apicid > 0) { 217 if (boot_cpu_physical_apicid > 0) {
218 pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); 218 pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
index 63c19e27aa6f..324aa3f07237 100644
--- a/arch/x86/mm/kmemcheck/opcode.c
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -9,7 +9,7 @@ static bool opcode_is_prefix(uint8_t b)
9 b == 0xf0 || b == 0xf2 || b == 0xf3 9 b == 0xf0 || b == 0xf2 || b == 0xf3
10 /* Group 2 */ 10 /* Group 2 */
11 || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26 11 || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
12 || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e 12 || b == 0x64 || b == 0x65
13 /* Group 3 */ 13 /* Group 3 */
14 || b == 0x66 14 || b == 0x66
15 /* Group 4 */ 15 /* Group 4 */
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index a7bcc23ef96c..4962f1aeda6f 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -18,7 +18,7 @@
18#include <asm/dma.h> 18#include <asm/dma.h>
19#include <asm/numa.h> 19#include <asm/numa.h>
20#include <asm/acpi.h> 20#include <asm/acpi.h>
21#include <asm/k8.h> 21#include <asm/amd_nb.h>
22 22
23struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 23struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
24EXPORT_SYMBOL(node_data); 24EXPORT_SYMBOL(node_data);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 5c4ee422590e..8be8c7d7bc89 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -87,7 +87,19 @@ static inline void pgd_list_del(pgd_t *pgd)
87#define UNSHARED_PTRS_PER_PGD \ 87#define UNSHARED_PTRS_PER_PGD \
88 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) 88 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
89 89
90static void pgd_ctor(pgd_t *pgd) 90
91static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
92{
93 BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
94 virt_to_page(pgd)->index = (pgoff_t)mm;
95}
96
97struct mm_struct *pgd_page_get_mm(struct page *page)
98{
99 return (struct mm_struct *)page->index;
100}
101
102static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
91{ 103{
92 /* If the pgd points to a shared pagetable level (either the 104 /* If the pgd points to a shared pagetable level (either the
93 ptes in non-PAE, or shared PMD in PAE), then just copy the 105 ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -98,15 +110,13 @@ static void pgd_ctor(pgd_t *pgd)
98 clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, 110 clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
99 swapper_pg_dir + KERNEL_PGD_BOUNDARY, 111 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
100 KERNEL_PGD_PTRS); 112 KERNEL_PGD_PTRS);
101 paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
102 __pa(swapper_pg_dir) >> PAGE_SHIFT,
103 KERNEL_PGD_BOUNDARY,
104 KERNEL_PGD_PTRS);
105 } 113 }
106 114
107 /* list required to sync kernel mapping updates */ 115 /* list required to sync kernel mapping updates */
108 if (!SHARED_KERNEL_PMD) 116 if (!SHARED_KERNEL_PMD) {
117 pgd_set_mm(pgd, mm);
109 pgd_list_add(pgd); 118 pgd_list_add(pgd);
119 }
110} 120}
111 121
112static void pgd_dtor(pgd_t *pgd) 122static void pgd_dtor(pgd_t *pgd)
@@ -272,7 +282,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
272 */ 282 */
273 spin_lock_irqsave(&pgd_lock, flags); 283 spin_lock_irqsave(&pgd_lock, flags);
274 284
275 pgd_ctor(pgd); 285 pgd_ctor(mm, pgd);
276 pgd_prepopulate_pmd(mm, pgd, pmds); 286 pgd_prepopulate_pmd(mm, pgd, pmds);
277 287
278 spin_unlock_irqrestore(&pgd_lock, flags); 288 spin_unlock_irqrestore(&pgd_lock, flags);
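The pgtable.c hunk above drops the old paravirt PMD-clone hook and instead has pgd_ctor() record the owning mm in the otherwise unused index field of the page backing the pgd, so later code walking pgd_list can map a pgd page back to its mm via pgd_page_get_mm(). A minimal standalone sketch of that trick (simplified stand-in types, not the kernel's struct page or pgd_t):

#include <assert.h>
#include <stdio.h>

struct mm_struct { int dummy; };
struct page { unsigned long index; };	/* stand-in for struct page::index */

static void pgd_set_mm(struct page *pg, struct mm_struct *mm)
{
	assert(sizeof(pg->index) >= sizeof(mm));	/* mirrors the BUILD_BUG_ON above */
	pg->index = (unsigned long)mm;
}

static struct mm_struct *pgd_page_get_mm(struct page *pg)
{
	return (struct mm_struct *)pg->index;
}

int main(void)
{
	struct mm_struct mm;
	struct page pgd_page = { 0 };

	pgd_set_mm(&pgd_page, &mm);	/* what pgd_ctor() does before adding to pgd_list */
	printf("recovered mm: %p (expected %p)\n",
	       (void *)pgd_page_get_mm(&pgd_page), (void *)&mm);
	return 0;
}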
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index c03f14ab6667..49358481c733 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -5,6 +5,7 @@
5#include <linux/smp.h> 5#include <linux/smp.h>
6#include <linux/interrupt.h> 6#include <linux/interrupt.h>
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/cpu.h>
8 9
9#include <asm/tlbflush.h> 10#include <asm/tlbflush.h>
10#include <asm/mmu_context.h> 11#include <asm/mmu_context.h>
@@ -52,6 +53,8 @@ union smp_flush_state {
52 want false sharing in the per cpu data segment. */ 53 want false sharing in the per cpu data segment. */
53static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS]; 54static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
54 55
56static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);
57
55/* 58/*
56 * We cannot call mmdrop() because we are in interrupt context, 59 * We cannot call mmdrop() because we are in interrupt context,
57 * instead update mm->cpu_vm_mask. 60 * instead update mm->cpu_vm_mask.
@@ -173,7 +176,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
173 union smp_flush_state *f; 176 union smp_flush_state *f;
174 177
175 /* Caller has disabled preemption */ 178 /* Caller has disabled preemption */
176 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS; 179 sender = this_cpu_read(tlb_vector_offset);
177 f = &flush_state[sender]; 180 f = &flush_state[sender];
178 181
179 /* 182 /*
@@ -218,6 +221,47 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
218 flush_tlb_others_ipi(cpumask, mm, va); 221 flush_tlb_others_ipi(cpumask, mm, va);
219} 222}
220 223
224static void __cpuinit calculate_tlb_offset(void)
225{
226 int cpu, node, nr_node_vecs;
227 /*
228	 * We change tlb_vector_offset for each CPU at runtime, but this
229	 * cannot cause inconsistency, as the write is atomic on x86. We
230	 * might see more lock contention for a short while, but once every
231	 * CPU's tlb_vector_offset has been updated things go back to normal.
232	 *
233	 * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes != 0, we
234	 * might waste some vectors.
235	 */
236 if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS)
237 nr_node_vecs = 1;
238 else
239 nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
240
241 for_each_online_node(node) {
242 int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) *
243 nr_node_vecs;
244 int cpu_offset = 0;
245 for_each_cpu(cpu, cpumask_of_node(node)) {
246 per_cpu(tlb_vector_offset, cpu) = node_offset +
247 cpu_offset;
248 cpu_offset++;
249 cpu_offset = cpu_offset % nr_node_vecs;
250 }
251 }
252}
253
254static int tlb_cpuhp_notify(struct notifier_block *n,
255 unsigned long action, void *hcpu)
256{
257 switch (action & 0xf) {
258 case CPU_ONLINE:
259 case CPU_DEAD:
260 calculate_tlb_offset();
261 }
262 return NOTIFY_OK;
263}
264
221static int __cpuinit init_smp_flush(void) 265static int __cpuinit init_smp_flush(void)
222{ 266{
223 int i; 267 int i;
@@ -225,6 +269,8 @@ static int __cpuinit init_smp_flush(void)
225 for (i = 0; i < ARRAY_SIZE(flush_state); i++) 269 for (i = 0; i < ARRAY_SIZE(flush_state); i++)
226 raw_spin_lock_init(&flush_state[i].tlbstate_lock); 270 raw_spin_lock_init(&flush_state[i].tlbstate_lock);
227 271
272 calculate_tlb_offset();
273 hotcpu_notifier(tlb_cpuhp_notify, 0);
228 return 0; 274 return 0;
229} 275}
230core_initcall(init_smp_flush); 276core_initcall(init_smp_flush);
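The tlb.c hunk above spreads the invalidate-TLB IPI vectors across NUMA nodes instead of hashing on smp_processor_id(), recomputing the per-CPU offset on CPU hotplug. A standalone sketch of the same partitioning arithmetic, with NUM_INVALIDATE_TLB_VECTORS taken as 8 here purely for illustration:

#include <stdio.h>

#define NUM_INVALIDATE_TLB_VECTORS 8

/* vector slot for the cpu_idx-th online CPU of a given node */
static int tlb_vector_for(int node, int cpu_idx_in_node, int nr_online_nodes)
{
	int nr_node_vecs = nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS ?
			   1 : NUM_INVALIDATE_TLB_VECTORS / nr_online_nodes;
	int node_offset = (node % NUM_INVALIDATE_TLB_VECTORS) * nr_node_vecs;

	return node_offset + cpu_idx_in_node % nr_node_vecs;
}

int main(void)
{
	/* two online nodes -> each node owns 4 of the 8 vectors */
	printf("%d\n", tlb_vector_for(1, 5, 2));	/* prints 5 (node base 4 + 5 %% 4) */
	return 0;
}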
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index b67a6b5aa8d4..42fb46f83883 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -64,15 +64,22 @@ static u64 ibs_op_ctl;
64 * IBS cpuid feature detection 64 * IBS cpuid feature detection
65 */ 65 */
66 66
67#define IBS_CPUID_FEATURES 0x8000001b 67#define IBS_CPUID_FEATURES 0x8000001b
68 68
69/* 69/*
70 * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but 70 * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but
71 * bit 0 is used to indicate the existence of IBS. 71 * bit 0 is used to indicate the existence of IBS.
72 */ 72 */
73#define IBS_CAPS_AVAIL (1LL<<0) 73#define IBS_CAPS_AVAIL (1U<<0)
74#define IBS_CAPS_RDWROPCNT (1LL<<3) 74#define IBS_CAPS_RDWROPCNT (1U<<3)
75#define IBS_CAPS_OPCNT (1LL<<4) 75#define IBS_CAPS_OPCNT (1U<<4)
76
77/*
78 * IBS APIC setup
79 */
80#define IBSCTL 0x1cc
81#define IBSCTL_LVT_OFFSET_VALID (1ULL<<8)
82#define IBSCTL_LVT_OFFSET_MASK 0x0F
76 83
77/* 84/*
78 * IBS randomization macros 85 * IBS randomization macros
@@ -266,6 +273,74 @@ static void op_amd_stop_ibs(void)
266 wrmsrl(MSR_AMD64_IBSOPCTL, 0); 273 wrmsrl(MSR_AMD64_IBSOPCTL, 0);
267} 274}
268 275
276static inline int eilvt_is_available(int offset)
277{
278 /* check if we may assign a vector */
279 return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
280}
281
282static inline int ibs_eilvt_valid(void)
283{
284 u64 val;
285 int offset;
286
287	rdmsrl(MSR_AMD64_IBSCTL, val);
288	offset = val & IBSCTL_LVT_OFFSET_MASK;
289
290	if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
291		pr_err(FW_BUG "cpu %d, invalid IBS "
292			"interrupt offset %d (MSR%08X=0x%016llx)",
293			smp_processor_id(), offset,
294			MSR_AMD64_IBSCTL, val);
295		return 0;
296	}
297
298 if (eilvt_is_available(offset))
299 return !0;
300
301 pr_err(FW_BUG "cpu %d, IBS interrupt offset %d "
302 "not available (MSR%08X=0x%016llx)",
303 smp_processor_id(), offset,
304 MSR_AMD64_IBSCTL, val);
305
306 return 0;
307}
308
309static inline int get_ibs_offset(void)
310{
311 u64 val;
312
313 rdmsrl(MSR_AMD64_IBSCTL, val);
314 if (!(val & IBSCTL_LVT_OFFSET_VALID))
315 return -EINVAL;
316
317 return val & IBSCTL_LVT_OFFSET_MASK;
318}
319
320static void setup_APIC_ibs(void)
321{
322 int offset;
323
324 offset = get_ibs_offset();
325 if (offset < 0)
326 goto failed;
327
328 if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
329 return;
330failed:
331 pr_warn("oprofile: IBS APIC setup failed on cpu #%d\n",
332 smp_processor_id());
333}
334
335static void clear_APIC_ibs(void)
336{
337 int offset;
338
339 offset = get_ibs_offset();
340 if (offset >= 0)
341 setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
342}
343
269#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX 344#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
270 345
271static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, 346static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
@@ -376,13 +451,13 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
376 } 451 }
377 452
378 if (ibs_caps) 453 if (ibs_caps)
379 setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0); 454 setup_APIC_ibs();
380} 455}
381 456
382static void op_amd_cpu_shutdown(void) 457static void op_amd_cpu_shutdown(void)
383{ 458{
384 if (ibs_caps) 459 if (ibs_caps)
385 setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); 460 clear_APIC_ibs();
386} 461}
387 462
388static int op_amd_check_ctrs(struct pt_regs * const regs, 463static int op_amd_check_ctrs(struct pt_regs * const regs,
@@ -445,16 +520,11 @@ static void op_amd_stop(struct op_msrs const * const msrs)
445 op_amd_stop_ibs(); 520 op_amd_stop_ibs();
446} 521}
447 522
448static int __init_ibs_nmi(void) 523static int setup_ibs_ctl(int ibs_eilvt_off)
449{ 524{
450#define IBSCTL_LVTOFFSETVAL (1 << 8)
451#define IBSCTL 0x1cc
452 struct pci_dev *cpu_cfg; 525 struct pci_dev *cpu_cfg;
453 int nodes; 526 int nodes;
454 u32 value = 0; 527 u32 value = 0;
455 u8 ibs_eilvt_off;
456
457 ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
458 528
459 nodes = 0; 529 nodes = 0;
460 cpu_cfg = NULL; 530 cpu_cfg = NULL;
@@ -466,21 +536,60 @@ static int __init_ibs_nmi(void)
466 break; 536 break;
467 ++nodes; 537 ++nodes;
468 pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off 538 pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
469 | IBSCTL_LVTOFFSETVAL); 539 | IBSCTL_LVT_OFFSET_VALID);
470 pci_read_config_dword(cpu_cfg, IBSCTL, &value); 540 pci_read_config_dword(cpu_cfg, IBSCTL, &value);
471 if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) { 541 if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
472 pci_dev_put(cpu_cfg); 542 pci_dev_put(cpu_cfg);
473 printk(KERN_DEBUG "Failed to setup IBS LVT offset, " 543 printk(KERN_DEBUG "Failed to setup IBS LVT offset, "
474 "IBSCTL = 0x%08x", value); 544 "IBSCTL = 0x%08x\n", value);
475 return 1; 545 return -EINVAL;
476 } 546 }
477 } while (1); 547 } while (1);
478 548
479 if (!nodes) { 549 if (!nodes) {
480 printk(KERN_DEBUG "No CPU node configured for IBS"); 550 printk(KERN_DEBUG "No CPU node configured for IBS\n");
481 return 1; 551 return -ENODEV;
552 }
553
554 return 0;
555}
556
557static int force_ibs_eilvt_setup(void)
558{
559 int i;
560 int ret;
561
562 /* find the next free available EILVT entry */
563 for (i = 1; i < 4; i++) {
564 if (!eilvt_is_available(i))
565 continue;
566 ret = setup_ibs_ctl(i);
567 if (ret)
568 return ret;
569 return 0;
482 } 570 }
483 571
572 printk(KERN_DEBUG "No EILVT entry available\n");
573
574 return -EBUSY;
575}
576
577static int __init_ibs_nmi(void)
578{
579 int ret;
580
581 if (ibs_eilvt_valid())
582 return 0;
583
584 ret = force_ibs_eilvt_setup();
585 if (ret)
586 return ret;
587
588 if (!ibs_eilvt_valid())
589 return -EFAULT;
590
591 pr_err(FW_BUG "workaround enabled for IBS LVT offset\n");
592
484 return 0; 593 return 0;
485} 594}
486 595
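The op_model_amd.c hunk above reworks IBS interrupt setup: __init_ibs_nmi() first trusts a BIOS-programmed EILVT offset (ibs_eilvt_valid()), and only if that fails does force_ibs_eilvt_setup() claim a free EILVT entry and program it into each northbridge's IBSCTL register. A small standalone sketch of how that register value is composed and decoded, reusing the IBSCTL_* definitions added above (the offset value itself is illustrative):

#include <stdint.h>
#include <stdio.h>

#define IBSCTL_LVT_OFFSET_VALID	(1ULL << 8)
#define IBSCTL_LVT_OFFSET_MASK	0x0F

int main(void)
{
	uint32_t ibs_eilvt_off = 1;	/* EILVT entry picked by force_ibs_eilvt_setup() */
	uint32_t value = ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID;

	/* what setup_ibs_ctl() writes per node and re-reads to verify */
	printf("IBSCTL = 0x%08x\n", value);		/* 0x00000101 */

	/* what ibs_eilvt_valid()/get_ibs_offset() later check */
	printf("offset = %u, valid = %u\n",
	       value & IBSCTL_LVT_OFFSET_MASK,
	       !!(value & IBSCTL_LVT_OFFSET_VALID));
	return 0;
}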
diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c
index b34815408f58..13700ec8e2e4 100644
--- a/arch/x86/pci/olpc.c
+++ b/arch/x86/pci/olpc.c
@@ -304,7 +304,7 @@ static struct pci_raw_ops pci_olpc_conf = {
304 304
305int __init pci_olpc_init(void) 305int __init pci_olpc_init(void)
306{ 306{
307 printk(KERN_INFO "PCI: Using configuration type OLPC\n"); 307 printk(KERN_INFO "PCI: Using configuration type OLPC XO-1\n");
308 raw_pci_ops = &pci_olpc_conf; 308 raw_pci_ops = &pci_olpc_conf;
309 is_lx = is_geode_lx(); 309 is_lx = is_geode_lx();
310 return 0; 310 return 0;
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 42086ac406af..b2363fcbcd0f 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1969,7 +1969,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1969 .alloc_pte = xen_alloc_pte_init, 1969 .alloc_pte = xen_alloc_pte_init,
1970 .release_pte = xen_release_pte_init, 1970 .release_pte = xen_release_pte_init,
1971 .alloc_pmd = xen_alloc_pmd_init, 1971 .alloc_pmd = xen_alloc_pmd_init,
1972 .alloc_pmd_clone = paravirt_nop,
1973 .release_pmd = xen_release_pmd_init, 1972 .release_pmd = xen_release_pmd_init,
1974 1973
1975#ifdef CONFIG_X86_64 1974#ifdef CONFIG_X86_64
diff --git a/arch/xtensa/kernel/irq.c b/arch/xtensa/kernel/irq.c
index c64a5d387de5..87508886cbbd 100644
--- a/arch/xtensa/kernel/irq.c
+++ b/arch/xtensa/kernel/irq.c
@@ -92,7 +92,7 @@ int show_interrupts(struct seq_file *p, void *v)
92 for_each_online_cpu(j) 92 for_each_online_cpu(j)
93 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); 93 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
94#endif 94#endif
95 seq_printf(p, " %14s", irq_desc[i].chip->typename); 95 seq_printf(p, " %14s", irq_desc[i].chip->name);
96 seq_printf(p, " %s", action->name); 96 seq_printf(p, " %s", action->name);
97 97
98 for (action=action->next; action; action = action->next) 98 for (action=action->next; action; action = action->next)
diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c
index 6b115f6c4313..6afceb3d4034 100644
--- a/drivers/acpi/acpi_pad.c
+++ b/drivers/acpi/acpi_pad.c
@@ -30,18 +30,13 @@
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <acpi/acpi_bus.h> 31#include <acpi/acpi_bus.h>
32#include <acpi/acpi_drivers.h> 32#include <acpi/acpi_drivers.h>
33#include <asm/mwait.h>
33 34
34#define ACPI_PROCESSOR_AGGREGATOR_CLASS "acpi_pad" 35#define ACPI_PROCESSOR_AGGREGATOR_CLASS "acpi_pad"
35#define ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME "Processor Aggregator" 36#define ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME "Processor Aggregator"
36#define ACPI_PROCESSOR_AGGREGATOR_NOTIFY 0x80 37#define ACPI_PROCESSOR_AGGREGATOR_NOTIFY 0x80
37static DEFINE_MUTEX(isolated_cpus_lock); 38static DEFINE_MUTEX(isolated_cpus_lock);
38 39
39#define MWAIT_SUBSTATE_MASK (0xf)
40#define MWAIT_CSTATE_MASK (0xf)
41#define MWAIT_SUBSTATE_SIZE (4)
42#define CPUID_MWAIT_LEAF (5)
43#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
44#define CPUID5_ECX_INTERRUPT_BREAK (0x2)
45static unsigned long power_saving_mwait_eax; 40static unsigned long power_saving_mwait_eax;
46 41
47static unsigned char tsc_detected_unstable; 42static unsigned char tsc_detected_unstable;
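The acpi_pad.c hunk above drops the driver's private copies of the MWAIT CPUID constants in favour of the shared <asm/mwait.h>. A standalone sketch of how those constants describe CPUID leaf 5 (the macro names match the removed defines; the register values below are made up):

#include <stdint.h>
#include <stdio.h>

#define MWAIT_SUBSTATE_MASK		(0xf)
#define MWAIT_SUBSTATE_SIZE		(4)
#define CPUID5_ECX_EXTENSIONS_SUPPORTED	(0x1)
#define CPUID5_ECX_INTERRUPT_BREAK	(0x2)

/* number of MWAIT sub-states for a given C-state, from CPUID.05H:EDX */
static unsigned int mwait_substates(uint32_t edx, unsigned int cstate)
{
	return (edx >> (cstate * MWAIT_SUBSTATE_SIZE)) & MWAIT_SUBSTATE_MASK;
}

int main(void)
{
	uint32_t ecx = 0x3, edx = 0x00002220;	/* illustrative CPUID leaf 5 output */

	if ((ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) &&
	    (ecx & CPUID5_ECX_INTERRUPT_BREAK))
		printf("C2 has %u MWAIT sub-states\n", mwait_substates(edx, 2));
	return 0;
}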
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index 9fc630ce1ddb..f6f37a05a0c3 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -45,7 +45,8 @@ static ssize_t show_##name(struct sys_device *dev, \
45 return sprintf(buf, "%d\n", topology_##name(cpu)); \ 45 return sprintf(buf, "%d\n", topology_##name(cpu)); \
46} 46}
47 47
48#if defined(topology_thread_cpumask) || defined(topology_core_cpumask) 48#if defined(topology_thread_cpumask) || defined(topology_core_cpumask) || \
49 defined(topology_book_cpumask)
49static ssize_t show_cpumap(int type, const struct cpumask *mask, char *buf) 50static ssize_t show_cpumap(int type, const struct cpumask *mask, char *buf)
50{ 51{
51 ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf; 52 ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
@@ -114,6 +115,14 @@ define_siblings_show_func(core_cpumask);
114define_one_ro_named(core_siblings, show_core_cpumask); 115define_one_ro_named(core_siblings, show_core_cpumask);
115define_one_ro_named(core_siblings_list, show_core_cpumask_list); 116define_one_ro_named(core_siblings_list, show_core_cpumask_list);
116 117
118#ifdef CONFIG_SCHED_BOOK
119define_id_show_func(book_id);
120define_one_ro(book_id);
121define_siblings_show_func(book_cpumask);
122define_one_ro_named(book_siblings, show_book_cpumask);
123define_one_ro_named(book_siblings_list, show_book_cpumask_list);
124#endif
125
117static struct attribute *default_attrs[] = { 126static struct attribute *default_attrs[] = {
118 &attr_physical_package_id.attr, 127 &attr_physical_package_id.attr,
119 &attr_core_id.attr, 128 &attr_core_id.attr,
@@ -121,6 +130,11 @@ static struct attribute *default_attrs[] = {
121 &attr_thread_siblings_list.attr, 130 &attr_thread_siblings_list.attr,
122 &attr_core_siblings.attr, 131 &attr_core_siblings.attr,
123 &attr_core_siblings_list.attr, 132 &attr_core_siblings_list.attr,
133#ifdef CONFIG_SCHED_BOOK
134 &attr_book_id.attr,
135 &attr_book_siblings.attr,
136 &attr_book_siblings_list.attr,
137#endif
124 NULL 138 NULL
125}; 139};
126 140
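The topology.c hunk above exports the new s390 "book" level through the usual per-CPU topology directory when CONFIG_SCHED_BOOK is set. A standalone sketch of reading the new attribute from userspace (the path follows the existing core_id/core_siblings layout; the value is whatever the running kernel reports):

#include <stdio.h>

int main(void)
{
	char buf[256];
	FILE *f = fopen("/sys/devices/system/cpu/cpu0/topology/book_id", "r");

	if (!f) {
		perror("book_id (kernel without CONFIG_SCHED_BOOK?)");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("cpu0 book_id: %s", buf);
	fclose(f);
	return 0;
}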
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index de277689da61..4b9359a6f6ca 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -488,4 +488,21 @@ config BLK_DEV_HD
488 488
489 If unsure, say N. 489 If unsure, say N.
490 490
491config BLK_DEV_RBD
492 tristate "Rados block device (RBD)"
493 depends on INET && EXPERIMENTAL && BLOCK
494 select CEPH_LIB
495 select LIBCRC32C
496 select CRYPTO_AES
497 select CRYPTO
498 default n
499 help
500	  Say Y here if you want to include the Rados block device, which stripes
501 a block device over objects stored in the Ceph distributed object
502 store.
503
504 More information at http://ceph.newdream.net/.
505
506 If unsure, say N.
507
491endif # BLK_DEV 508endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index aff5ac925c34..d7f463d6312d 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -37,5 +37,6 @@ obj-$(CONFIG_BLK_DEV_HD) += hd.o
37 37
38obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o 38obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
39obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ 39obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
40obj-$(CONFIG_BLK_DEV_RBD) += rbd.o
40 41
41swim_mod-objs := swim.o swim_asm.o 42swim_mod-objs := swim.o swim_asm.o
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
new file mode 100644
index 000000000000..6ec9d53806c5
--- /dev/null
+++ b/drivers/block/rbd.c
@@ -0,0 +1,1841 @@
1/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
24 Instructions for use
25 --------------------
26
27 1) Map a Linux block device to an existing rbd image.
28
29 Usage: <mon ip addr> <options> <pool name> <rbd image name> [snap name]
30
31 $ echo "192.168.0.1 name=admin rbd foo" > /sys/class/rbd/add
32
33 The snapshot name can be "-" or omitted to map the image read/write.
34
35 2) List all active blkdev<->object mappings.
36
37 In this example, we have performed step #1 twice, creating two blkdevs,
38 mapped to two separate rados objects in the rados rbd pool
39
40 $ cat /sys/class/rbd/list
41 #id major client_name pool name snap KB
42 0 254 client4143 rbd foo - 1024000
43
44 The columns, in order, are:
45 - blkdev unique id
46 - blkdev assigned major
47 - rados client id
48 - rados pool name
49 - rados block device name
50 - mapped snapshot ("-" if none)
51 - device size in KB
52
53
54 3) Create a snapshot.
55
56 Usage: <blkdev id> <snapname>
57
58 $ echo "0 mysnap" > /sys/class/rbd/snap_create
59
60
 61   4) List snapshots.
62
63 $ cat /sys/class/rbd/snaps_list
64 #id snap KB
65 0 - 1024000 (*)
66 0 foo 1024000
67
68 The columns, in order, are:
69 - blkdev unique id
70 - snapshot name, '-' means none (active read/write version)
71 - size of device at time of snapshot
72 - the (*) indicates this is the active version
73
74 5) Rollback to snapshot.
75
76 Usage: <blkdev id> <snapname>
77
78 $ echo "0 mysnap" > /sys/class/rbd/snap_rollback
79
80
 81   6) Map an image using a snapshot.
82
 83   A snapshot mapping is read-only. This is done by passing
84 snap=<snapname> to the options when adding a device.
85
86 $ echo "192.168.0.1 name=admin,snap=mysnap rbd foo" > /sys/class/rbd/add
87
88
89 7) Remove an active blkdev<->rbd image mapping.
90
91 In this example, we remove the mapping with blkdev unique id 1.
92
93 $ echo 1 > /sys/class/rbd/remove
94
95
96 NOTE: The actual creation and deletion of rados objects is outside the scope
97 of this driver.
98
99 */
100
101#include <linux/ceph/libceph.h>
102#include <linux/ceph/osd_client.h>
103#include <linux/ceph/mon_client.h>
104#include <linux/ceph/decode.h>
105
106#include <linux/kernel.h>
107#include <linux/device.h>
108#include <linux/module.h>
109#include <linux/fs.h>
110#include <linux/blkdev.h>
111
112#include "rbd_types.h"
113
114#define DRV_NAME "rbd"
115#define DRV_NAME_LONG "rbd (rados block device)"
116
117#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
118
119#define RBD_MAX_MD_NAME_LEN (96 + sizeof(RBD_SUFFIX))
120#define RBD_MAX_POOL_NAME_LEN 64
121#define RBD_MAX_SNAP_NAME_LEN 32
122#define RBD_MAX_OPT_LEN 1024
123
124#define RBD_SNAP_HEAD_NAME "-"
125
126#define DEV_NAME_LEN 32
127
128/*
129 * block device image metadata (in-memory version)
130 */
131struct rbd_image_header {
132 u64 image_size;
133 char block_name[32];
134 __u8 obj_order;
135 __u8 crypt_type;
136 __u8 comp_type;
137 struct rw_semaphore snap_rwsem;
138 struct ceph_snap_context *snapc;
139 size_t snap_names_len;
140 u64 snap_seq;
141 u32 total_snaps;
142
143 char *snap_names;
144 u64 *snap_sizes;
145};
146
147/*
148 * an instance of the client. multiple devices may share a client.
149 */
150struct rbd_client {
151 struct ceph_client *client;
152 struct kref kref;
153 struct list_head node;
154};
155
156/*
157 * a single io request
158 */
159struct rbd_request {
160 struct request *rq; /* blk layer request */
161 struct bio *bio; /* cloned bio */
162 struct page **pages; /* list of used pages */
163 u64 len;
164};
165
166/*
167 * a single device
168 */
169struct rbd_device {
170 int id; /* blkdev unique id */
171
172 int major; /* blkdev assigned major */
173 struct gendisk *disk; /* blkdev's gendisk and rq */
174 struct request_queue *q;
175
176 struct ceph_client *client;
177 struct rbd_client *rbd_client;
178
179 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
180
181 spinlock_t lock; /* queue lock */
182
183 struct rbd_image_header header;
184 char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
185 int obj_len;
186 char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
187 char pool_name[RBD_MAX_POOL_NAME_LEN];
188 int poolid;
189
190 char snap_name[RBD_MAX_SNAP_NAME_LEN];
191 u32 cur_snap; /* index+1 of current snapshot within snap context
192 0 - for the head */
193 int read_only;
194
195 struct list_head node;
196};
197
198static spinlock_t node_lock; /* protects client get/put */
199
200static struct class *class_rbd; /* /sys/class/rbd */
201static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
202static LIST_HEAD(rbd_dev_list); /* devices */
203static LIST_HEAD(rbd_client_list); /* clients */
204
205
206static int rbd_open(struct block_device *bdev, fmode_t mode)
207{
208 struct gendisk *disk = bdev->bd_disk;
209 struct rbd_device *rbd_dev = disk->private_data;
210
211 set_device_ro(bdev, rbd_dev->read_only);
212
213 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
214 return -EROFS;
215
216 return 0;
217}
218
219static const struct block_device_operations rbd_bd_ops = {
220 .owner = THIS_MODULE,
221 .open = rbd_open,
222};
223
224/*
225 * Initialize an rbd client instance.
226 * We own *opt.
227 */
228static struct rbd_client *rbd_client_create(struct ceph_options *opt)
229{
230 struct rbd_client *rbdc;
231 int ret = -ENOMEM;
232
233 dout("rbd_client_create\n");
234 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
235 if (!rbdc)
236 goto out_opt;
237
238 kref_init(&rbdc->kref);
239 INIT_LIST_HEAD(&rbdc->node);
240
241 rbdc->client = ceph_create_client(opt, rbdc);
242 if (IS_ERR(rbdc->client))
243 goto out_rbdc;
244 opt = NULL; /* Now rbdc->client is responsible for opt */
245
246 ret = ceph_open_session(rbdc->client);
247 if (ret < 0)
248 goto out_err;
249
250 spin_lock(&node_lock);
251 list_add_tail(&rbdc->node, &rbd_client_list);
252 spin_unlock(&node_lock);
253
254 dout("rbd_client_create created %p\n", rbdc);
255 return rbdc;
256
257out_err:
258 ceph_destroy_client(rbdc->client);
259out_rbdc:
260 kfree(rbdc);
261out_opt:
262 if (opt)
263 ceph_destroy_options(opt);
264 return ERR_PTR(ret);
265}
266
267/*
268 * Find a ceph client with specific addr and configuration.
269 */
270static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
271{
272 struct rbd_client *client_node;
273
274 if (opt->flags & CEPH_OPT_NOSHARE)
275 return NULL;
276
277 list_for_each_entry(client_node, &rbd_client_list, node)
278 if (ceph_compare_options(opt, client_node->client) == 0)
279 return client_node;
280 return NULL;
281}
282
283/*
284 * Get a ceph client with specific addr and configuration, if one does
285 * not exist create it.
286 */
287static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
288 char *options)
289{
290 struct rbd_client *rbdc;
291 struct ceph_options *opt;
292 int ret;
293
294 ret = ceph_parse_options(&opt, options, mon_addr,
295 mon_addr + strlen(mon_addr), NULL, NULL);
296 if (ret < 0)
297 return ret;
298
299 spin_lock(&node_lock);
300 rbdc = __rbd_client_find(opt);
301 if (rbdc) {
302 ceph_destroy_options(opt);
303
304 /* using an existing client */
305 kref_get(&rbdc->kref);
306 rbd_dev->rbd_client = rbdc;
307 rbd_dev->client = rbdc->client;
308 spin_unlock(&node_lock);
309 return 0;
310 }
311 spin_unlock(&node_lock);
312
313 rbdc = rbd_client_create(opt);
314 if (IS_ERR(rbdc))
315 return PTR_ERR(rbdc);
316
317 rbd_dev->rbd_client = rbdc;
318 rbd_dev->client = rbdc->client;
319 return 0;
320}
321
322/*
323 * Destroy ceph client
324 */
325static void rbd_client_release(struct kref *kref)
326{
327 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
328
329 dout("rbd_release_client %p\n", rbdc);
330 spin_lock(&node_lock);
331 list_del(&rbdc->node);
332 spin_unlock(&node_lock);
333
334 ceph_destroy_client(rbdc->client);
335 kfree(rbdc);
336}
337
338/*
339 * Drop reference to ceph client node. If it's not referenced anymore, release
340 * it.
341 */
342static void rbd_put_client(struct rbd_device *rbd_dev)
343{
344 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
345 rbd_dev->rbd_client = NULL;
346 rbd_dev->client = NULL;
347}
348
349
350/*
351 * Create a new header structure, translate header format from the on-disk
352 * header.
353 */
354static int rbd_header_from_disk(struct rbd_image_header *header,
355 struct rbd_image_header_ondisk *ondisk,
356 int allocated_snaps,
357 gfp_t gfp_flags)
358{
359 int i;
360 u32 snap_count = le32_to_cpu(ondisk->snap_count);
361 int ret = -ENOMEM;
362
363 init_rwsem(&header->snap_rwsem);
364
365 header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
366 header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
367 snap_count *
368 sizeof(struct rbd_image_snap_ondisk),
369 gfp_flags);
370 if (!header->snapc)
371 return -ENOMEM;
372 if (snap_count) {
373 header->snap_names = kmalloc(header->snap_names_len,
374 GFP_KERNEL);
375 if (!header->snap_names)
376 goto err_snapc;
377 header->snap_sizes = kmalloc(snap_count * sizeof(u64),
378 GFP_KERNEL);
379 if (!header->snap_sizes)
380 goto err_names;
381 } else {
382 header->snap_names = NULL;
383 header->snap_sizes = NULL;
384 }
385 memcpy(header->block_name, ondisk->block_name,
386 sizeof(ondisk->block_name));
387
388 header->image_size = le64_to_cpu(ondisk->image_size);
389 header->obj_order = ondisk->options.order;
390 header->crypt_type = ondisk->options.crypt_type;
391 header->comp_type = ondisk->options.comp_type;
392
393 atomic_set(&header->snapc->nref, 1);
394 header->snap_seq = le64_to_cpu(ondisk->snap_seq);
395 header->snapc->num_snaps = snap_count;
396 header->total_snaps = snap_count;
397
398 if (snap_count &&
399 allocated_snaps == snap_count) {
400 for (i = 0; i < snap_count; i++) {
401 header->snapc->snaps[i] =
402 le64_to_cpu(ondisk->snaps[i].id);
403 header->snap_sizes[i] =
404 le64_to_cpu(ondisk->snaps[i].image_size);
405 }
406
407 /* copy snapshot names */
408 memcpy(header->snap_names, &ondisk->snaps[i],
409 header->snap_names_len);
410 }
411
412 return 0;
413
414err_names:
415 kfree(header->snap_names);
416err_snapc:
417 kfree(header->snapc);
418 return ret;
419}
420
421static int snap_index(struct rbd_image_header *header, int snap_num)
422{
423 return header->total_snaps - snap_num;
424}
425
426static u64 cur_snap_id(struct rbd_device *rbd_dev)
427{
428 struct rbd_image_header *header = &rbd_dev->header;
429
430 if (!rbd_dev->cur_snap)
431 return 0;
432
433 return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)];
434}
435
436static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
437 u64 *seq, u64 *size)
438{
439 int i;
440 char *p = header->snap_names;
441
442 for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
443 if (strcmp(snap_name, p) == 0)
444 break;
445 }
446 if (i == header->total_snaps)
447 return -ENOENT;
448 if (seq)
449 *seq = header->snapc->snaps[i];
450
451 if (size)
452 *size = header->snap_sizes[i];
453
454 return i;
455}
456
457static int rbd_header_set_snap(struct rbd_device *dev,
458 const char *snap_name,
459 u64 *size)
460{
461 struct rbd_image_header *header = &dev->header;
462 struct ceph_snap_context *snapc = header->snapc;
463 int ret = -ENOENT;
464
465 down_write(&header->snap_rwsem);
466
467 if (!snap_name ||
468 !*snap_name ||
469 strcmp(snap_name, "-") == 0 ||
470 strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) {
471 if (header->total_snaps)
472 snapc->seq = header->snap_seq;
473 else
474 snapc->seq = 0;
475 dev->cur_snap = 0;
476 dev->read_only = 0;
477 if (size)
478 *size = header->image_size;
479 } else {
480 ret = snap_by_name(header, snap_name, &snapc->seq, size);
481 if (ret < 0)
482 goto done;
483
484 dev->cur_snap = header->total_snaps - ret;
485 dev->read_only = 1;
486 }
487
488 ret = 0;
489done:
490 up_write(&header->snap_rwsem);
491 return ret;
492}
493
494static void rbd_header_free(struct rbd_image_header *header)
495{
496 kfree(header->snapc);
497 kfree(header->snap_names);
498 kfree(header->snap_sizes);
499}
500
501/*
502 * get the actual striped segment name, offset and length
503 */
504static u64 rbd_get_segment(struct rbd_image_header *header,
505 const char *block_name,
506 u64 ofs, u64 len,
507 char *seg_name, u64 *segofs)
508{
509 u64 seg = ofs >> header->obj_order;
510
511 if (seg_name)
512 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
513 "%s.%012llx", block_name, seg);
514
515 ofs = ofs & ((1 << header->obj_order) - 1);
516 len = min_t(u64, len, (1 << header->obj_order) - ofs);
517
518 if (segofs)
519 *segofs = ofs;
520
521 return len;
522}
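rbd_get_segment() above turns an image-relative offset/length into the name of one backing RADOS object plus the offset/length within it; whatever does not fit in that object is left for the next loop iteration in rbd_rq_fn(). A standalone worked example of the same arithmetic (not part of rbd.c) for obj_order = 22, i.e. 4 MiB objects; the numbers are illustrative:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned obj_order = 22;
	uint64_t ofs = 10 * 1024 * 1024;	/* I/O starts 10 MiB into the image */
	uint64_t len = 3 * 1024 * 1024;		/* 3 MiB request */

	uint64_t seg     = ofs >> obj_order;			/* object #2 */
	uint64_t seg_ofs = ofs & ((1ULL << obj_order) - 1);	/* 2 MiB into it */
	uint64_t room    = (1ULL << obj_order) - seg_ofs;
	uint64_t seg_len = len < room ? len : room;		/* clamped to 2 MiB */

	printf("<block_name>.%012llx ofs=%llu len=%llu\n",
	       (unsigned long long)seg,
	       (unsigned long long)seg_ofs,
	       (unsigned long long)seg_len);	/* remaining 1 MiB goes to object #3 */
	return 0;
}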
523
524/*
525 * bio helpers
526 */
527
528static void bio_chain_put(struct bio *chain)
529{
530 struct bio *tmp;
531
532 while (chain) {
533 tmp = chain;
534 chain = chain->bi_next;
535 bio_put(tmp);
536 }
537}
538
539/*
 540 * zeros a bio chain, starting at a specific offset
541 */
542static void zero_bio_chain(struct bio *chain, int start_ofs)
543{
544 struct bio_vec *bv;
545 unsigned long flags;
546 void *buf;
547 int i;
548 int pos = 0;
549
550 while (chain) {
551 bio_for_each_segment(bv, chain, i) {
552 if (pos + bv->bv_len > start_ofs) {
553 int remainder = max(start_ofs - pos, 0);
554 buf = bvec_kmap_irq(bv, &flags);
555 memset(buf + remainder, 0,
556 bv->bv_len - remainder);
557 bvec_kunmap_irq(buf, &flags);
558 }
559 pos += bv->bv_len;
560 }
561
562 chain = chain->bi_next;
563 }
564}
565
566/*
567 * bio_chain_clone - clone a chain of bios up to a certain length.
568 * might return a bio_pair that will need to be released.
569 */
570static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
571 struct bio_pair **bp,
572 int len, gfp_t gfpmask)
573{
574 struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
575 int total = 0;
576
577 if (*bp) {
578 bio_pair_release(*bp);
579 *bp = NULL;
580 }
581
582 while (old_chain && (total < len)) {
583 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
584 if (!tmp)
585 goto err_out;
586
587 if (total + old_chain->bi_size > len) {
588 struct bio_pair *bp;
589
590 /*
 591			 * this split can only happen with a single-page bio;
 592			 * bio_split() will BUG_ON() if this is not the case
593 */
 594			dout("bio_chain_clone split! total=%d remaining=%d "
595 "bi_size=%d\n",
596 (int)total, (int)len-total,
597 (int)old_chain->bi_size);
598
599 /* split the bio. We'll release it either in the next
600 call, or it will have to be released outside */
601 bp = bio_split(old_chain, (len - total) / 512ULL);
602 if (!bp)
603 goto err_out;
604
605 __bio_clone(tmp, &bp->bio1);
606
607 *next = &bp->bio2;
608 } else {
609 __bio_clone(tmp, old_chain);
610 *next = old_chain->bi_next;
611 }
612
613 tmp->bi_bdev = NULL;
614 gfpmask &= ~__GFP_WAIT;
615 tmp->bi_next = NULL;
616
617 if (!new_chain) {
618 new_chain = tail = tmp;
619 } else {
620 tail->bi_next = tmp;
621 tail = tmp;
622 }
623 old_chain = old_chain->bi_next;
624
625 total += tmp->bi_size;
626 }
627
628 BUG_ON(total < len);
629
630 if (tail)
631 tail->bi_next = NULL;
632
633 *old = old_chain;
634
635 return new_chain;
636
637err_out:
638 dout("bio_chain_clone with err\n");
639 bio_chain_put(new_chain);
640 return NULL;
641}
642
643/*
644 * helpers for osd request op vectors.
645 */
646static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
647 int num_ops,
648 int opcode,
649 u32 payload_len)
650{
651 *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
652 GFP_NOIO);
653 if (!*ops)
654 return -ENOMEM;
655 (*ops)[0].op = opcode;
656 /*
657 * op extent offset and length will be set later on
658 * in calc_raw_layout()
659 */
660 (*ops)[0].payload_len = payload_len;
661 return 0;
662}
663
664static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
665{
666 kfree(ops);
667}
668
669/*
670 * Send ceph osd request
671 */
672static int rbd_do_request(struct request *rq,
673 struct rbd_device *dev,
674 struct ceph_snap_context *snapc,
675 u64 snapid,
676 const char *obj, u64 ofs, u64 len,
677 struct bio *bio,
678 struct page **pages,
679 int num_pages,
680 int flags,
681 struct ceph_osd_req_op *ops,
682 int num_reply,
683 void (*rbd_cb)(struct ceph_osd_request *req,
684 struct ceph_msg *msg))
685{
686 struct ceph_osd_request *req;
687 struct ceph_file_layout *layout;
688 int ret;
689 u64 bno;
690 struct timespec mtime = CURRENT_TIME;
691 struct rbd_request *req_data;
692 struct ceph_osd_request_head *reqhead;
693 struct rbd_image_header *header = &dev->header;
694
695 ret = -ENOMEM;
696 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
697 if (!req_data)
698 goto done;
699
700 dout("rbd_do_request len=%lld ofs=%lld\n", len, ofs);
701
702 down_read(&header->snap_rwsem);
703
704 req = ceph_osdc_alloc_request(&dev->client->osdc, flags,
705 snapc,
706 ops,
707 false,
708 GFP_NOIO, pages, bio);
709 if (IS_ERR(req)) {
710 up_read(&header->snap_rwsem);
711 ret = PTR_ERR(req);
712 goto done_pages;
713 }
714
715 req->r_callback = rbd_cb;
716
717 req_data->rq = rq;
718 req_data->bio = bio;
719 req_data->pages = pages;
720 req_data->len = len;
721
722 req->r_priv = req_data;
723
724 reqhead = req->r_request->front.iov_base;
725 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
726
727 strncpy(req->r_oid, obj, sizeof(req->r_oid));
728 req->r_oid_len = strlen(req->r_oid);
729
730 layout = &req->r_file_layout;
731 memset(layout, 0, sizeof(*layout));
732 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
733 layout->fl_stripe_count = cpu_to_le32(1);
734 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
735 layout->fl_pg_preferred = cpu_to_le32(-1);
736 layout->fl_pg_pool = cpu_to_le32(dev->poolid);
737 ceph_calc_raw_layout(&dev->client->osdc, layout, snapid,
738 ofs, &len, &bno, req, ops);
739
740 ceph_osdc_build_request(req, ofs, &len,
741 ops,
742 snapc,
743 &mtime,
744 req->r_oid, req->r_oid_len);
745 up_read(&header->snap_rwsem);
746
747 ret = ceph_osdc_start_request(&dev->client->osdc, req, false);
748 if (ret < 0)
749 goto done_err;
750
751 if (!rbd_cb) {
752 ret = ceph_osdc_wait_request(&dev->client->osdc, req);
753 ceph_osdc_put_request(req);
754 }
755 return ret;
756
757done_err:
758 bio_chain_put(req_data->bio);
759 ceph_osdc_put_request(req);
760done_pages:
761 kfree(req_data);
762done:
763 if (rq)
764 blk_end_request(rq, ret, len);
765 return ret;
766}
767
768/*
769 * Ceph osd op callback
770 */
771static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
772{
773 struct rbd_request *req_data = req->r_priv;
774 struct ceph_osd_reply_head *replyhead;
775 struct ceph_osd_op *op;
776 __s32 rc;
777 u64 bytes;
778 int read_op;
779
780 /* parse reply */
781 replyhead = msg->front.iov_base;
782 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
783 op = (void *)(replyhead + 1);
784 rc = le32_to_cpu(replyhead->result);
785 bytes = le64_to_cpu(op->extent.length);
786 read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);
787
788 dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
789
790 if (rc == -ENOENT && read_op) {
791 zero_bio_chain(req_data->bio, 0);
792 rc = 0;
793 } else if (rc == 0 && read_op && bytes < req_data->len) {
794 zero_bio_chain(req_data->bio, bytes);
795 bytes = req_data->len;
796 }
797
798 blk_end_request(req_data->rq, rc, bytes);
799
800 if (req_data->bio)
801 bio_chain_put(req_data->bio);
802
803 ceph_osdc_put_request(req);
804 kfree(req_data);
805}
806
807/*
808 * Do a synchronous ceph osd operation
809 */
810static int rbd_req_sync_op(struct rbd_device *dev,
811 struct ceph_snap_context *snapc,
812 u64 snapid,
813 int opcode,
814 int flags,
815 struct ceph_osd_req_op *orig_ops,
816 int num_reply,
817 const char *obj,
818 u64 ofs, u64 len,
819 char *buf)
820{
821 int ret;
822 struct page **pages;
823 int num_pages;
824 struct ceph_osd_req_op *ops = orig_ops;
825 u32 payload_len;
826
827 num_pages = calc_pages_for(ofs , len);
828 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
829 if (IS_ERR(pages))
830 return PTR_ERR(pages);
831
832 if (!orig_ops) {
833 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
834 ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
835 if (ret < 0)
836 goto done;
837
838 if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
839 ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
840 if (ret < 0)
841 goto done_ops;
842 }
843 }
844
845 ret = rbd_do_request(NULL, dev, snapc, snapid,
846 obj, ofs, len, NULL,
847 pages, num_pages,
848 flags,
849 ops,
850 2,
851 NULL);
852 if (ret < 0)
853 goto done_ops;
854
855 if ((flags & CEPH_OSD_FLAG_READ) && buf)
856 ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
857
858done_ops:
859 if (!orig_ops)
860 rbd_destroy_ops(ops);
861done:
862 ceph_release_page_vector(pages, num_pages);
863 return ret;
864}
865
866/*
867 * Do an asynchronous ceph osd operation
868 */
869static int rbd_do_op(struct request *rq,
870 struct rbd_device *rbd_dev ,
871 struct ceph_snap_context *snapc,
872 u64 snapid,
873 int opcode, int flags, int num_reply,
874 u64 ofs, u64 len,
875 struct bio *bio)
876{
877 char *seg_name;
878 u64 seg_ofs;
879 u64 seg_len;
880 int ret;
881 struct ceph_osd_req_op *ops;
882 u32 payload_len;
883
884 seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
885 if (!seg_name)
886 return -ENOMEM;
887
888 seg_len = rbd_get_segment(&rbd_dev->header,
889 rbd_dev->header.block_name,
890 ofs, len,
891 seg_name, &seg_ofs);
892
893 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
894
895 ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
896 if (ret < 0)
897 goto done;
898
899 /* we've taken care of segment sizes earlier when we
900 cloned the bios. We should never have a segment
901 truncated at this point */
902 BUG_ON(seg_len < len);
903
904 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
905 seg_name, seg_ofs, seg_len,
906 bio,
907 NULL, 0,
908 flags,
909 ops,
910 num_reply,
911 rbd_req_cb);
912done:
913 kfree(seg_name);
914 return ret;
915}
916
917/*
918 * Request async osd write
919 */
920static int rbd_req_write(struct request *rq,
921 struct rbd_device *rbd_dev,
922 struct ceph_snap_context *snapc,
923 u64 ofs, u64 len,
924 struct bio *bio)
925{
926 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
927 CEPH_OSD_OP_WRITE,
928 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
929 2,
930 ofs, len, bio);
931}
932
933/*
934 * Request async osd read
935 */
936static int rbd_req_read(struct request *rq,
937 struct rbd_device *rbd_dev,
938 u64 snapid,
939 u64 ofs, u64 len,
940 struct bio *bio)
941{
942 return rbd_do_op(rq, rbd_dev, NULL,
943 (snapid ? snapid : CEPH_NOSNAP),
944 CEPH_OSD_OP_READ,
945 CEPH_OSD_FLAG_READ,
946 2,
947 ofs, len, bio);
948}
949
950/*
951 * Request sync osd read
952 */
953static int rbd_req_sync_read(struct rbd_device *dev,
954 struct ceph_snap_context *snapc,
955 u64 snapid,
956 const char *obj,
957 u64 ofs, u64 len,
958 char *buf)
959{
960 return rbd_req_sync_op(dev, NULL,
961 (snapid ? snapid : CEPH_NOSNAP),
962 CEPH_OSD_OP_READ,
963 CEPH_OSD_FLAG_READ,
964 NULL,
965 1, obj, ofs, len, buf);
966}
967
968/*
 969 * Request sync osd rollback
970 */
971static int rbd_req_sync_rollback_obj(struct rbd_device *dev,
972 u64 snapid,
973 const char *obj)
974{
975 struct ceph_osd_req_op *ops;
976 int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_ROLLBACK, 0);
977 if (ret < 0)
978 return ret;
979
980 ops[0].snap.snapid = snapid;
981
982 ret = rbd_req_sync_op(dev, NULL,
983 CEPH_NOSNAP,
984 0,
985 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
986 ops,
987 1, obj, 0, 0, NULL);
988
989 rbd_destroy_ops(ops);
990
991 if (ret < 0)
992 return ret;
993
994 return ret;
995}
996
997/*
998 * Request sync osd read
999 */
1000static int rbd_req_sync_exec(struct rbd_device *dev,
1001 const char *obj,
1002 const char *cls,
1003 const char *method,
1004 const char *data,
1005 int len)
1006{
1007 struct ceph_osd_req_op *ops;
1008 int cls_len = strlen(cls);
1009 int method_len = strlen(method);
1010 int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
1011 cls_len + method_len + len);
1012 if (ret < 0)
1013 return ret;
1014
1015 ops[0].cls.class_name = cls;
1016 ops[0].cls.class_len = (__u8)cls_len;
1017 ops[0].cls.method_name = method;
1018 ops[0].cls.method_len = (__u8)method_len;
1019 ops[0].cls.argc = 0;
1020 ops[0].cls.indata = data;
1021 ops[0].cls.indata_len = len;
1022
1023 ret = rbd_req_sync_op(dev, NULL,
1024 CEPH_NOSNAP,
1025 0,
1026 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1027 ops,
1028 1, obj, 0, 0, NULL);
1029
1030 rbd_destroy_ops(ops);
1031
1032 dout("cls_exec returned %d\n", ret);
1033 return ret;
1034}
1035
1036/*
1037 * block device queue callback
1038 */
1039static void rbd_rq_fn(struct request_queue *q)
1040{
1041 struct rbd_device *rbd_dev = q->queuedata;
1042 struct request *rq;
1043 struct bio_pair *bp = NULL;
1044
1045 rq = blk_fetch_request(q);
1046
1047 while (1) {
1048 struct bio *bio;
1049 struct bio *rq_bio, *next_bio = NULL;
1050 bool do_write;
1051 int size, op_size = 0;
1052 u64 ofs;
1053
1054 /* peek at request from block layer */
1055 if (!rq)
1056 break;
1057
1058 dout("fetched request\n");
1059
1060 /* filter out block requests we don't understand */
1061 if ((rq->cmd_type != REQ_TYPE_FS)) {
1062 __blk_end_request_all(rq, 0);
1063 goto next;
1064 }
1065
1066 /* deduce our operation (read, write) */
1067 do_write = (rq_data_dir(rq) == WRITE);
1068
1069 size = blk_rq_bytes(rq);
1070 ofs = blk_rq_pos(rq) * 512ULL;
1071 rq_bio = rq->bio;
1072 if (do_write && rbd_dev->read_only) {
1073 __blk_end_request_all(rq, -EROFS);
1074 goto next;
1075 }
1076
1077 spin_unlock_irq(q->queue_lock);
1078
1079 dout("%s 0x%x bytes at 0x%llx\n",
1080 do_write ? "write" : "read",
1081 size, blk_rq_pos(rq) * 512ULL);
1082
1083 do {
1084 /* a bio clone to be passed down to OSD req */
1085 dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
1086 op_size = rbd_get_segment(&rbd_dev->header,
1087 rbd_dev->header.block_name,
1088 ofs, size,
1089 NULL, NULL);
1090 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1091 op_size, GFP_ATOMIC);
1092 if (!bio) {
1093 spin_lock_irq(q->queue_lock);
1094 __blk_end_request_all(rq, -ENOMEM);
1095 goto next;
1096 }
1097
1098 /* init OSD command: write or read */
1099 if (do_write)
1100 rbd_req_write(rq, rbd_dev,
1101 rbd_dev->header.snapc,
1102 ofs,
1103 op_size, bio);
1104 else
1105 rbd_req_read(rq, rbd_dev,
1106 cur_snap_id(rbd_dev),
1107 ofs,
1108 op_size, bio);
1109
1110 size -= op_size;
1111 ofs += op_size;
1112
1113 rq_bio = next_bio;
1114 } while (size > 0);
1115
1116 if (bp)
1117 bio_pair_release(bp);
1118
1119 spin_lock_irq(q->queue_lock);
1120next:
1121 rq = blk_fetch_request(q);
1122 }
1123}
1124
1125/*
1126 * a queue callback. Makes sure that we don't create a bio that spans across
 1127 * multiple osd objects. One exception would be with single-page bios,
1128 * which we handle later at bio_chain_clone
1129 */
1130static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1131 struct bio_vec *bvec)
1132{
1133 struct rbd_device *rbd_dev = q->queuedata;
1134 unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9);
1135 sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1136 unsigned int bio_sectors = bmd->bi_size >> 9;
1137 int max;
1138
1139 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
1140 + bio_sectors)) << 9;
1141 if (max < 0)
1142 max = 0; /* bio_add cannot handle a negative return */
1143 if (max <= bvec->bv_len && bio_sectors == 0)
1144 return bvec->bv_len;
1145 return max;
1146}
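rbd_merge_bvec() above caps each bio at the object boundary implied by obj_order, which is why bio_chain_clone() rarely has to split. A standalone sketch (not part of rbd.c) of the same arithmetic with obj_order = 22 and illustrative sector numbers:

#include <stdio.h>

int main(void)
{
	unsigned int chunk_sectors = 1 << (22 - 9);	/* 8192 sectors per 4 MiB object */
	unsigned long long sector = 8000;		/* bio starts at this image sector */
	unsigned int bio_sectors = 100;			/* bio already holds 100 sectors */
	int max;

	/* bytes left before the bio would cross into the next object */
	max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
	if (max < 0)
		max = 0;	/* bio_add cannot handle a negative return */

	printf("%d bytes may still be added\n", max);	/* 92 sectors -> 47104 bytes */
	return 0;
}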
1147
1148static void rbd_free_disk(struct rbd_device *rbd_dev)
1149{
1150 struct gendisk *disk = rbd_dev->disk;
1151
1152 if (!disk)
1153 return;
1154
1155 rbd_header_free(&rbd_dev->header);
1156
1157 if (disk->flags & GENHD_FL_UP)
1158 del_gendisk(disk);
1159 if (disk->queue)
1160 blk_cleanup_queue(disk->queue);
1161 put_disk(disk);
1162}
1163
1164/*
 1165 * reload the on-disk header
1166 */
1167static int rbd_read_header(struct rbd_device *rbd_dev,
1168 struct rbd_image_header *header)
1169{
1170 ssize_t rc;
1171 struct rbd_image_header_ondisk *dh;
1172 int snap_count = 0;
1173 u64 snap_names_len = 0;
1174
1175 while (1) {
1176 int len = sizeof(*dh) +
1177 snap_count * sizeof(struct rbd_image_snap_ondisk) +
1178 snap_names_len;
1179
1180 rc = -ENOMEM;
1181 dh = kmalloc(len, GFP_KERNEL);
1182 if (!dh)
1183 return -ENOMEM;
1184
1185 rc = rbd_req_sync_read(rbd_dev,
1186 NULL, CEPH_NOSNAP,
1187 rbd_dev->obj_md_name,
1188 0, len,
1189 (char *)dh);
1190 if (rc < 0)
1191 goto out_dh;
1192
1193 rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
1194 if (rc < 0)
1195 goto out_dh;
1196
1197 if (snap_count != header->total_snaps) {
1198 snap_count = header->total_snaps;
1199 snap_names_len = header->snap_names_len;
1200 rbd_header_free(header);
1201 kfree(dh);
1202 continue;
1203 }
1204 break;
1205 }
1206
1207out_dh:
1208 kfree(dh);
1209 return rc;
1210}
1211
1212/*
1213 * create a snapshot
1214 */
1215static int rbd_header_add_snap(struct rbd_device *dev,
1216 const char *snap_name,
1217 gfp_t gfp_flags)
1218{
1219 int name_len = strlen(snap_name);
1220 u64 new_snapid;
1221 int ret;
1222 void *data, *data_start, *data_end;
1223
1224 /* we should create a snapshot only if we're pointing at the head */
1225 if (dev->cur_snap)
1226 return -EINVAL;
1227
1228 ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid,
1229 &new_snapid);
1230 dout("created snapid=%lld\n", new_snapid);
1231 if (ret < 0)
1232 return ret;
1233
1234 data = kmalloc(name_len + 16, gfp_flags);
1235 if (!data)
1236 return -ENOMEM;
1237
1238 data_start = data;
1239 data_end = data + name_len + 16;
1240
1241 ceph_encode_string_safe(&data, data_end, snap_name, name_len, bad);
1242 ceph_encode_64_safe(&data, data_end, new_snapid, bad);
1243
1244 ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
1245 data_start, data - data_start);
1246
1247 kfree(data_start);
1248
1249 if (ret < 0)
1250 return ret;
1251
1252 dev->header.snapc->seq = new_snapid;
1253
1254 return 0;
1255bad:
1256 return -ERANGE;
1257}
1258
1259/*
1260 * only read the first part of the ondisk header, without the snaps info
1261 */
1262static int rbd_update_snaps(struct rbd_device *rbd_dev)
1263{
1264 int ret;
1265 struct rbd_image_header h;
1266 u64 snap_seq;
1267
1268 ret = rbd_read_header(rbd_dev, &h);
1269 if (ret < 0)
1270 return ret;
1271
1272 down_write(&rbd_dev->header.snap_rwsem);
1273
1274 snap_seq = rbd_dev->header.snapc->seq;
1275
1276 kfree(rbd_dev->header.snapc);
1277 kfree(rbd_dev->header.snap_names);
1278 kfree(rbd_dev->header.snap_sizes);
1279
1280 rbd_dev->header.total_snaps = h.total_snaps;
1281 rbd_dev->header.snapc = h.snapc;
1282 rbd_dev->header.snap_names = h.snap_names;
1283 rbd_dev->header.snap_sizes = h.snap_sizes;
1284 rbd_dev->header.snapc->seq = snap_seq;
1285
1286 up_write(&rbd_dev->header.snap_rwsem);
1287
1288 return 0;
1289}
1290
1291static int rbd_init_disk(struct rbd_device *rbd_dev)
1292{
1293 struct gendisk *disk;
1294 struct request_queue *q;
1295 int rc;
1296 u64 total_size = 0;
1297
1298 /* contact OSD, request size info about the object being mapped */
1299 rc = rbd_read_header(rbd_dev, &rbd_dev->header);
1300 if (rc)
1301 return rc;
1302
1303 rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size);
1304 if (rc)
1305 return rc;
1306
1307 /* create gendisk info */
1308 rc = -ENOMEM;
1309 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1310 if (!disk)
1311 goto out;
1312
1313 sprintf(disk->disk_name, DRV_NAME "%d", rbd_dev->id);
1314 disk->major = rbd_dev->major;
1315 disk->first_minor = 0;
1316 disk->fops = &rbd_bd_ops;
1317 disk->private_data = rbd_dev;
1318
1319 /* init rq */
1320 rc = -ENOMEM;
1321 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1322 if (!q)
1323 goto out_disk;
1324 blk_queue_merge_bvec(q, rbd_merge_bvec);
1325 disk->queue = q;
1326
1327 q->queuedata = rbd_dev;
1328
1329 rbd_dev->disk = disk;
1330 rbd_dev->q = q;
1331
1332 /* finally, announce the disk to the world */
1333 set_capacity(disk, total_size / 512ULL);
1334 add_disk(disk);
1335
1336 pr_info("%s: added with size 0x%llx\n",
1337 disk->disk_name, (unsigned long long)total_size);
1338 return 0;
1339
1340out_disk:
1341 put_disk(disk);
1342out:
1343 return rc;
1344}
1345
1346/********************************************************************
1347 * /sys/class/rbd/
1348 * add map rados objects to blkdev
1349 * remove unmap rados objects
1350 * list show mappings
1351 *******************************************************************/
1352
1353static void class_rbd_release(struct class *cls)
1354{
1355 kfree(cls);
1356}
1357
1358static ssize_t class_rbd_list(struct class *c,
1359 struct class_attribute *attr,
1360 char *data)
1361{
1362 int n = 0;
1363 struct list_head *tmp;
1364 int max = PAGE_SIZE;
1365
1366 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1367
1368 n += snprintf(data, max,
1369 "#id\tmajor\tclient_name\tpool\tname\tsnap\tKB\n");
1370
1371 list_for_each(tmp, &rbd_dev_list) {
1372 struct rbd_device *rbd_dev;
1373
1374 rbd_dev = list_entry(tmp, struct rbd_device, node);
1375 n += snprintf(data+n, max-n,
1376 "%d\t%d\tclient%lld\t%s\t%s\t%s\t%lld\n",
1377 rbd_dev->id,
1378 rbd_dev->major,
1379 ceph_client_id(rbd_dev->client),
1380 rbd_dev->pool_name,
1381 rbd_dev->obj, rbd_dev->snap_name,
1382 rbd_dev->header.image_size >> 10);
1383 if (n == max)
1384 break;
1385 }
1386
1387 mutex_unlock(&ctl_mutex);
1388 return n;
1389}
1390
1391static ssize_t class_rbd_add(struct class *c,
1392 struct class_attribute *attr,
1393 const char *buf, size_t count)
1394{
1395 struct ceph_osd_client *osdc;
1396 struct rbd_device *rbd_dev;
1397 ssize_t rc = -ENOMEM;
1398 int irc, new_id = 0;
1399 struct list_head *tmp;
1400 char *mon_dev_name;
1401 char *options;
1402
1403 if (!try_module_get(THIS_MODULE))
1404 return -ENODEV;
1405
1406 mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
1407 if (!mon_dev_name)
1408 goto err_out_mod;
1409
1410 options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
1411 if (!options)
1412 goto err_mon_dev;
1413
1414 /* new rbd_device object */
1415 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
1416 if (!rbd_dev)
1417 goto err_out_opt;
1418
1419 /* static rbd_device initialization */
1420 spin_lock_init(&rbd_dev->lock);
1421 INIT_LIST_HEAD(&rbd_dev->node);
1422
1423 /* generate unique id: find highest unique id, add one */
1424 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1425
1426 list_for_each(tmp, &rbd_dev_list) {
1427 struct rbd_device *rbd_dev;
1428
1429 rbd_dev = list_entry(tmp, struct rbd_device, node);
1430 if (rbd_dev->id >= new_id)
1431 new_id = rbd_dev->id + 1;
1432 }
1433
1434 rbd_dev->id = new_id;
1435
1436 /* add to global list */
1437 list_add_tail(&rbd_dev->node, &rbd_dev_list);
1438
1439 /* parse add command */
1440 if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s "
1441 "%" __stringify(RBD_MAX_OPT_LEN) "s "
1442 "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s "
1443 "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s"
1444 "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
1445 mon_dev_name, options, rbd_dev->pool_name,
1446 rbd_dev->obj, rbd_dev->snap_name) < 4) {
1447 rc = -EINVAL;
1448 goto err_out_slot;
1449 }
1450
1451 if (rbd_dev->snap_name[0] == 0)
1452 rbd_dev->snap_name[0] = '-';
1453
1454 rbd_dev->obj_len = strlen(rbd_dev->obj);
1455 snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s",
1456 rbd_dev->obj, RBD_SUFFIX);
1457
1458 /* initialize rest of new object */
1459 snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id);
1460 rc = rbd_get_client(rbd_dev, mon_dev_name, options);
1461 if (rc < 0)
1462 goto err_out_slot;
1463
1464 mutex_unlock(&ctl_mutex);
1465
1466 /* pick the pool */
1467 osdc = &rbd_dev->client->osdc;
1468 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
1469 if (rc < 0)
1470 goto err_out_client;
1471 rbd_dev->poolid = rc;
1472
1473 /* register our block device */
1474 irc = register_blkdev(0, rbd_dev->name);
1475 if (irc < 0) {
1476 rc = irc;
1477 goto err_out_client;
1478 }
1479 rbd_dev->major = irc;
1480
1481 /* set up and announce blkdev mapping */
1482 rc = rbd_init_disk(rbd_dev);
1483 if (rc)
1484 goto err_out_blkdev;
1485
1486 return count;
1487
1488err_out_blkdev:
1489 unregister_blkdev(rbd_dev->major, rbd_dev->name);
1490err_out_client:
1491 rbd_put_client(rbd_dev);
1492 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1493err_out_slot:
1494 list_del_init(&rbd_dev->node);
1495 mutex_unlock(&ctl_mutex);
1496
1497 kfree(rbd_dev);
1498err_out_opt:
1499 kfree(options);
1500err_mon_dev:
1501 kfree(mon_dev_name);
1502err_out_mod:
1503 dout("Error adding device %s\n", buf);
1504 module_put(THIS_MODULE);
1505 return rc;
1506}
1507
1508static struct rbd_device *__rbd_get_dev(unsigned long id)
1509{
1510 struct list_head *tmp;
1511 struct rbd_device *rbd_dev;
1512
1513 list_for_each(tmp, &rbd_dev_list) {
1514 rbd_dev = list_entry(tmp, struct rbd_device, node);
1515 if (rbd_dev->id == id)
1516 return rbd_dev;
1517 }
1518 return NULL;
1519}
1520
1521static ssize_t class_rbd_remove(struct class *c,
1522 struct class_attribute *attr,
1523 const char *buf,
1524 size_t count)
1525{
1526 struct rbd_device *rbd_dev = NULL;
1527 int target_id, rc;
1528 unsigned long ul;
1529
1530 rc = strict_strtoul(buf, 10, &ul);
1531 if (rc)
1532 return rc;
1533
1534 /* convert to int; abort if we lost anything in the conversion */
1535 target_id = (int) ul;
1536 if (target_id != ul)
1537 return -EINVAL;
1538
1539 /* remove object from list immediately */
1540 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1541
1542 rbd_dev = __rbd_get_dev(target_id);
1543 if (rbd_dev)
1544 list_del_init(&rbd_dev->node);
1545
1546 mutex_unlock(&ctl_mutex);
1547
1548 if (!rbd_dev)
1549 return -ENOENT;
1550
1551 rbd_put_client(rbd_dev);
1552
1553 /* clean up and free blkdev */
1554 rbd_free_disk(rbd_dev);
1555 unregister_blkdev(rbd_dev->major, rbd_dev->name);
1556 kfree(rbd_dev);
1557
1558 /* release module ref */
1559 module_put(THIS_MODULE);
1560
1561 return count;
1562}
1563
1564static ssize_t class_rbd_snaps_list(struct class *c,
1565 struct class_attribute *attr,
1566 char *data)
1567{
1568 struct rbd_device *rbd_dev = NULL;
1569 struct list_head *tmp;
1570 struct rbd_image_header *header;
1571 int i, n = 0, max = PAGE_SIZE;
1572 int ret;
1573
1574 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1575
1576 n += snprintf(data, max, "#id\tsnap\tKB\n");
1577
1578 list_for_each(tmp, &rbd_dev_list) {
1579 char *names, *p;
1580 struct ceph_snap_context *snapc;
1581
1582 rbd_dev = list_entry(tmp, struct rbd_device, node);
1583 header = &rbd_dev->header;
1584
1585 down_read(&header->snap_rwsem);
1586
1587 names = header->snap_names;
1588 snapc = header->snapc;
1589
1590 n += snprintf(data + n, max - n, "%d\t%s\t%lld%s\n",
1591 rbd_dev->id, RBD_SNAP_HEAD_NAME,
1592 header->image_size >> 10,
1593 (!rbd_dev->cur_snap ? " (*)" : ""));
1594 if (n == max)
1595 break;
1596
1597 p = names;
1598 for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
1599 n += snprintf(data + n, max - n, "%d\t%s\t%lld%s\n",
1600 rbd_dev->id, p, header->snap_sizes[i] >> 10,
1601 (rbd_dev->cur_snap &&
1602 (snap_index(header, i) == rbd_dev->cur_snap) ?
1603 " (*)" : ""));
1604 if (n == max)
1605 break;
1606 }
1607
1608 up_read(&header->snap_rwsem);
1609 }
1610
1611
1612 ret = n;
1613 mutex_unlock(&ctl_mutex);
1614 return ret;
1615}
1616
1617static ssize_t class_rbd_snaps_refresh(struct class *c,
1618 struct class_attribute *attr,
1619 const char *buf,
1620 size_t count)
1621{
1622 struct rbd_device *rbd_dev = NULL;
1623 int target_id, rc;
1624 unsigned long ul;
1625 int ret = count;
1626
1627 rc = strict_strtoul(buf, 10, &ul);
1628 if (rc)
1629 return rc;
1630
1631 /* convert to int; abort if we lost anything in the conversion */
1632 target_id = (int) ul;
1633 if (target_id != ul)
1634 return -EINVAL;
1635
1636 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1637
1638 rbd_dev = __rbd_get_dev(target_id);
1639 if (!rbd_dev) {
1640 ret = -ENOENT;
1641 goto done;
1642 }
1643
1644 rc = rbd_update_snaps(rbd_dev);
1645 if (rc < 0)
1646 ret = rc;
1647
1648done:
1649 mutex_unlock(&ctl_mutex);
1650 return ret;
1651}
1652
1653static ssize_t class_rbd_snap_create(struct class *c,
1654 struct class_attribute *attr,
1655 const char *buf,
1656 size_t count)
1657{
1658 struct rbd_device *rbd_dev = NULL;
1659 int target_id, ret;
1660 char *name;
1661
1662 name = kmalloc(RBD_MAX_SNAP_NAME_LEN + 1, GFP_KERNEL);
1663 if (!name)
1664 return -ENOMEM;
1665
1666 /* parse snaps add command */
1667 if (sscanf(buf, "%d "
1668 "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
1669 &target_id,
1670 name) != 2) {
1671 ret = -EINVAL;
1672 goto done;
1673 }
1674
1675 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1676
1677 rbd_dev = __rbd_get_dev(target_id);
1678 if (!rbd_dev) {
1679 ret = -ENOENT;
1680 goto done_unlock;
1681 }
1682
1683 ret = rbd_header_add_snap(rbd_dev,
1684 name, GFP_KERNEL);
1685 if (ret < 0)
1686 goto done_unlock;
1687
1688 ret = rbd_update_snaps(rbd_dev);
1689 if (ret < 0)
1690 goto done_unlock;
1691
1692 ret = count;
1693done_unlock:
1694 mutex_unlock(&ctl_mutex);
1695done:
1696 kfree(name);
1697 return ret;
1698}
1699
1700static ssize_t class_rbd_rollback(struct class *c,
1701 struct class_attribute *attr,
1702 const char *buf,
1703 size_t count)
1704{
1705 struct rbd_device *rbd_dev = NULL;
1706 int target_id, ret;
1707 u64 snapid;
1708 char snap_name[RBD_MAX_SNAP_NAME_LEN];
1709 u64 cur_ofs;
1710 char *seg_name;
1711
1712 /* parse snaps add command */
1713 if (sscanf(buf, "%d "
1714 "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
1715 &target_id,
1716 snap_name) != 2) {
1717 return -EINVAL;
1718 }
1719
1720 ret = -ENOMEM;
1721 seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
1722 if (!seg_name)
1723 return ret;
1724
1725 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1726
1727 rbd_dev = __rbd_get_dev(target_id);
1728 if (!rbd_dev) {
1729 ret = -ENOENT;
1730 goto done_unlock;
1731 }
1732
1733 ret = snap_by_name(&rbd_dev->header, snap_name, &snapid, NULL);
1734 if (ret < 0)
1735 goto done_unlock;
1736
1737 dout("snapid=%lld\n", snapid);
1738
1739 cur_ofs = 0;
1740 while (cur_ofs < rbd_dev->header.image_size) {
1741 cur_ofs += rbd_get_segment(&rbd_dev->header,
1742 rbd_dev->obj,
1743 cur_ofs, (u64)-1,
1744 seg_name, NULL);
1745 dout("seg_name=%s\n", seg_name);
1746
1747 ret = rbd_req_sync_rollback_obj(rbd_dev, snapid, seg_name);
1748 if (ret < 0)
1749 pr_warning("could not roll back obj %s err=%d\n",
1750 seg_name, ret);
1751 }
1752
1753 ret = rbd_update_snaps(rbd_dev);
1754 if (ret < 0)
1755 goto done_unlock;
1756
1757 ret = count;
1758
1759done_unlock:
1760 mutex_unlock(&ctl_mutex);
1761 kfree(seg_name);
1762
1763 return ret;
1764}
1765
1766static struct class_attribute class_rbd_attrs[] = {
1767 __ATTR(add, 0200, NULL, class_rbd_add),
1768 __ATTR(remove, 0200, NULL, class_rbd_remove),
1769 __ATTR(list, 0444, class_rbd_list, NULL),
1770 __ATTR(snaps_refresh, 0200, NULL, class_rbd_snaps_refresh),
1771 __ATTR(snap_create, 0200, NULL, class_rbd_snap_create),
1772 __ATTR(snaps_list, 0444, class_rbd_snaps_list, NULL),
1773 __ATTR(snap_rollback, 0200, NULL, class_rbd_rollback),
1774 __ATTR_NULL
1775};
1776
1777/*
1778 * create control files in sysfs
1779 * /sys/class/rbd/...
1780 */
1781static int rbd_sysfs_init(void)
1782{
1783 int ret = -ENOMEM;
1784
1785 class_rbd = kzalloc(sizeof(*class_rbd), GFP_KERNEL);
1786 if (!class_rbd)
1787 goto out;
1788
1789 class_rbd->name = DRV_NAME;
1790 class_rbd->owner = THIS_MODULE;
1791 class_rbd->class_release = class_rbd_release;
1792 class_rbd->class_attrs = class_rbd_attrs;
1793
1794 ret = class_register(class_rbd);
1795 if (ret)
1796 goto out_class;
1797 return 0;
1798
1799out_class:
1800 kfree(class_rbd);
1801 class_rbd = NULL;
1802 pr_err(DRV_NAME ": failed to create class rbd\n");
1803out:
1804 return ret;
1805}
1806
1807static void rbd_sysfs_cleanup(void)
1808{
1809 if (class_rbd)
1810 class_destroy(class_rbd);
1811 class_rbd = NULL;
1812}
1813
1814int __init rbd_init(void)
1815{
1816 int rc;
1817
1818 rc = rbd_sysfs_init();
1819 if (rc)
1820 return rc;
1821 spin_lock_init(&node_lock);
1822 pr_info("loaded " DRV_NAME_LONG "\n");
1823 return 0;
1824}
1825
1826void __exit rbd_exit(void)
1827{
1828 rbd_sysfs_cleanup();
1829}
1830
1831module_init(rbd_init);
1832module_exit(rbd_exit);
1833
1834MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
1835MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
1836MODULE_DESCRIPTION("rados block device");
1837
1838/* following authorship retained from original osdblk.c */
1839MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
1840
1841MODULE_LICENSE("GPL");
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h
new file mode 100644
index 000000000000..fc6c678aa2cb
--- /dev/null
+++ b/drivers/block/rbd_types.h
@@ -0,0 +1,73 @@
1/*
2 * Ceph - scalable distributed file system
3 *
4 * Copyright (C) 2004-2010 Sage Weil <sage@newdream.net>
5 *
6 * This is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License version 2.1, as published by the Free Software
9 * Foundation. See file COPYING.
10 *
11 */
12
13#ifndef CEPH_RBD_TYPES_H
14#define CEPH_RBD_TYPES_H
15
16#include <linux/types.h>
17
18/*
19 * rbd image 'foo' consists of objects
20 * foo.rbd - image metadata
21 * foo.00000000
22 * foo.00000001
23 * ... - data
24 */
25
26#define RBD_SUFFIX ".rbd"
27#define RBD_DIRECTORY "rbd_directory"
28#define RBD_INFO "rbd_info"
29
30#define RBD_DEFAULT_OBJ_ORDER 22 /* 4MB */
31#define RBD_MIN_OBJ_ORDER 16
32#define RBD_MAX_OBJ_ORDER 30
33
34#define RBD_MAX_OBJ_NAME_LEN 96
35#define RBD_MAX_SEG_NAME_LEN 128
36
37#define RBD_COMP_NONE 0
38#define RBD_CRYPT_NONE 0
39
40#define RBD_HEADER_TEXT "<<< Rados Block Device Image >>>\n"
41#define RBD_HEADER_SIGNATURE "RBD"
42#define RBD_HEADER_VERSION "001.005"
43
44struct rbd_info {
45 __le64 max_id;
46} __attribute__ ((packed));
47
48struct rbd_image_snap_ondisk {
49 __le64 id;
50 __le64 image_size;
51} __attribute__((packed));
52
53struct rbd_image_header_ondisk {
54 char text[40];
55 char block_name[24];
56 char signature[4];
57 char version[8];
58 struct {
59 __u8 order;
60 __u8 crypt_type;
61 __u8 comp_type;
62 __u8 unused;
63 } __attribute__((packed)) options;
64 __le64 image_size;
65 __le64 snap_seq;
66 __le32 snap_count;
67 __le32 reserved;
68 __le64 snap_names_len;
69 struct rbd_image_snap_ondisk snaps[0];
70} __attribute__((packed));
71
72
73#endif
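
The on-disk header defined above is variable length: the fixed fields are followed by snap_count rbd_image_snap_ondisk entries and, going by snap_names_len, a blob of snapshot name bytes, matching the "foo.rbd plus numbered data objects" layout described in the comment at the top of the file. A small host-side sketch of sizing and sanity-checking such a header follows; the unpacked mirror structure, the helper names and the decision to ignore the __le endianness conversions are illustrative assumptions, not driver code.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define RBD_HEADER_TEXT "<<< Rados Block Device Image >>>\n"

/* Host-side mirror of rbd_image_header_ondisk, trimmed to what is used here. */
struct rbd_ondisk {
        char     text[40];
        char     block_name[24];
        char     signature[4];
        char     version[8];
        struct { uint8_t order, crypt_type, comp_type, unused; } options;
        uint64_t image_size;
        uint64_t snap_seq;
        uint32_t snap_count;
        uint32_t reserved;
        uint64_t snap_names_len;
        /* followed by snap_count {id, image_size} pairs, then the name blob */
};

/* Bytes needed for the header plus its trailing snapshot records and names. */
static size_t rbd_header_bytes(const struct rbd_ondisk *h)
{
        return sizeof(*h) + h->snap_count * 2 * sizeof(uint64_t) +
               h->snap_names_len;
}

/* The magic text at the start of every image header identifies the format. */
static int rbd_header_valid(const struct rbd_ondisk *h)
{
        return memcmp(h->text, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT) - 1) == 0;
}

int main(void)
{
        struct rbd_ondisk h = { .snap_count = 3, .snap_names_len = 17 };

        memcpy(h.text, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT) - 1);
        printf("valid=%d, total=%zu bytes\n",
               rbd_header_valid(&h), rbd_header_bytes(&h));
        return 0;
}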
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 1101e251a629..8320490226b7 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -2,7 +2,6 @@
2#include <linux/spinlock.h> 2#include <linux/spinlock.h>
3#include <linux/slab.h> 3#include <linux/slab.h>
4#include <linux/blkdev.h> 4#include <linux/blkdev.h>
5#include <linux/smp_lock.h>
6#include <linux/hdreg.h> 5#include <linux/hdreg.h>
7#include <linux/virtio.h> 6#include <linux/virtio.h>
8#include <linux/virtio_blk.h> 7#include <linux/virtio_blk.h>
@@ -222,8 +221,8 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
222 return err; 221 return err;
223} 222}
224 223
225static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode, 224static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
226 unsigned cmd, unsigned long data) 225 unsigned int cmd, unsigned long data)
227{ 226{
228 struct gendisk *disk = bdev->bd_disk; 227 struct gendisk *disk = bdev->bd_disk;
229 struct virtio_blk *vblk = disk->private_data; 228 struct virtio_blk *vblk = disk->private_data;
@@ -238,18 +237,6 @@ static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode,
238 (void __user *)data); 237 (void __user *)data);
239} 238}
240 239
241static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
242 unsigned int cmd, unsigned long param)
243{
244 int ret;
245
246 lock_kernel();
247 ret = virtblk_locked_ioctl(bdev, mode, cmd, param);
248 unlock_kernel();
249
250 return ret;
251}
252
253/* We provide getgeo only to please some old bootloader/partitioning tools */ 240/* We provide getgeo only to please some old bootloader/partitioning tools */
254static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo) 241static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
255{ 242{
diff --git a/drivers/char/agp/Kconfig b/drivers/char/agp/Kconfig
index 4b66c69eaf57..5ddf67e76f8b 100644
--- a/drivers/char/agp/Kconfig
+++ b/drivers/char/agp/Kconfig
@@ -57,7 +57,7 @@ config AGP_AMD
57 57
58config AGP_AMD64 58config AGP_AMD64
59 tristate "AMD Opteron/Athlon64 on-CPU GART support" 59 tristate "AMD Opteron/Athlon64 on-CPU GART support"
60 depends on AGP && X86 && K8_NB 60 depends on AGP && X86 && AMD_NB
61 help 61 help
62 This option gives you AGP support for the GLX component of 62 This option gives you AGP support for the GLX component of
63 X using the on-CPU northbridge of the AMD Athlon64/Opteron CPUs. 63 X using the on-CPU northbridge of the AMD Athlon64/Opteron CPUs.
diff --git a/drivers/char/agp/amd64-agp.c b/drivers/char/agp/amd64-agp.c
index 70312da4c968..42396df55556 100644
--- a/drivers/char/agp/amd64-agp.c
+++ b/drivers/char/agp/amd64-agp.c
@@ -15,7 +15,7 @@
15#include <linux/mmzone.h> 15#include <linux/mmzone.h>
16#include <asm/page.h> /* PAGE_SIZE */ 16#include <asm/page.h> /* PAGE_SIZE */
17#include <asm/e820.h> 17#include <asm/e820.h>
18#include <asm/k8.h> 18#include <asm/amd_nb.h>
19#include <asm/gart.h> 19#include <asm/gart.h>
20#include "agp.h" 20#include "agp.h"
21 21
@@ -124,7 +124,7 @@ static int amd64_fetch_size(void)
124 u32 temp; 124 u32 temp;
125 struct aper_size_info_32 *values; 125 struct aper_size_info_32 *values;
126 126
127 dev = k8_northbridges[0]; 127 dev = k8_northbridges.nb_misc[0];
128 if (dev==NULL) 128 if (dev==NULL)
129 return 0; 129 return 0;
130 130
@@ -181,10 +181,14 @@ static int amd_8151_configure(void)
181 unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real); 181 unsigned long gatt_bus = virt_to_phys(agp_bridge->gatt_table_real);
182 int i; 182 int i;
183 183
184 if (!k8_northbridges.gart_supported)
185 return 0;
186
184 /* Configure AGP regs in each x86-64 host bridge. */ 187 /* Configure AGP regs in each x86-64 host bridge. */
185 for (i = 0; i < num_k8_northbridges; i++) { 188 for (i = 0; i < k8_northbridges.num; i++) {
186 agp_bridge->gart_bus_addr = 189 agp_bridge->gart_bus_addr =
187 amd64_configure(k8_northbridges[i], gatt_bus); 190 amd64_configure(k8_northbridges.nb_misc[i],
191 gatt_bus);
188 } 192 }
189 k8_flush_garts(); 193 k8_flush_garts();
190 return 0; 194 return 0;
@@ -195,11 +199,15 @@ static void amd64_cleanup(void)
195{ 199{
196 u32 tmp; 200 u32 tmp;
197 int i; 201 int i;
198 for (i = 0; i < num_k8_northbridges; i++) { 202
199 struct pci_dev *dev = k8_northbridges[i]; 203 if (!k8_northbridges.gart_supported)
204 return;
205
206 for (i = 0; i < k8_northbridges.num; i++) {
207 struct pci_dev *dev = k8_northbridges.nb_misc[i];
200 /* disable gart translation */ 208 /* disable gart translation */
201 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &tmp); 209 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &tmp);
202 tmp &= ~AMD64_GARTEN; 210 tmp &= ~GARTEN;
203 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, tmp); 211 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, tmp);
204 } 212 }
205} 213}
@@ -313,22 +321,25 @@ static __devinit int fix_northbridge(struct pci_dev *nb, struct pci_dev *agp,
313 if (order < 0 || !agp_aperture_valid(aper, (32*1024*1024)<<order)) 321 if (order < 0 || !agp_aperture_valid(aper, (32*1024*1024)<<order))
314 return -1; 322 return -1;
315 323
316 pci_write_config_dword(nb, AMD64_GARTAPERTURECTL, order << 1); 324 gart_set_size_and_enable(nb, order);
317 pci_write_config_dword(nb, AMD64_GARTAPERTUREBASE, aper >> 25); 325 pci_write_config_dword(nb, AMD64_GARTAPERTUREBASE, aper >> 25);
318 326
319 return 0; 327 return 0;
320} 328}
321 329
322static __devinit int cache_nbs (struct pci_dev *pdev, u32 cap_ptr) 330static __devinit int cache_nbs(struct pci_dev *pdev, u32 cap_ptr)
323{ 331{
324 int i; 332 int i;
325 333
326 if (cache_k8_northbridges() < 0) 334 if (cache_k8_northbridges() < 0)
327 return -ENODEV; 335 return -ENODEV;
328 336
337 if (!k8_northbridges.gart_supported)
338 return -ENODEV;
339
329 i = 0; 340 i = 0;
330 for (i = 0; i < num_k8_northbridges; i++) { 341 for (i = 0; i < k8_northbridges.num; i++) {
331 struct pci_dev *dev = k8_northbridges[i]; 342 struct pci_dev *dev = k8_northbridges.nb_misc[i];
332 if (fix_northbridge(dev, pdev, cap_ptr) < 0) { 343 if (fix_northbridge(dev, pdev, cap_ptr) < 0) {
333 dev_err(&dev->dev, "no usable aperture found\n"); 344 dev_err(&dev->dev, "no usable aperture found\n");
334#ifdef __x86_64__ 345#ifdef __x86_64__
@@ -405,7 +416,8 @@ static int __devinit uli_agp_init(struct pci_dev *pdev)
405 } 416 }
406 417
407 /* shadow x86-64 registers into ULi registers */ 418 /* shadow x86-64 registers into ULi registers */
408 pci_read_config_dword (k8_northbridges[0], AMD64_GARTAPERTUREBASE, &httfea); 419 pci_read_config_dword (k8_northbridges.nb_misc[0], AMD64_GARTAPERTUREBASE,
420 &httfea);
409 421
410 /* if x86-64 aperture base is beyond 4G, exit here */ 422 /* if x86-64 aperture base is beyond 4G, exit here */
411 if ((httfea & 0x7fff) >> (32 - 25)) { 423 if ((httfea & 0x7fff) >> (32 - 25)) {
@@ -472,7 +484,8 @@ static int nforce3_agp_init(struct pci_dev *pdev)
472 pci_write_config_dword(dev1, NVIDIA_X86_64_1_APSIZE, tmp); 484 pci_write_config_dword(dev1, NVIDIA_X86_64_1_APSIZE, tmp);
473 485
474 /* shadow x86-64 registers into NVIDIA registers */ 486 /* shadow x86-64 registers into NVIDIA registers */
475 pci_read_config_dword (k8_northbridges[0], AMD64_GARTAPERTUREBASE, &apbase); 487 pci_read_config_dword (k8_northbridges.nb_misc[0], AMD64_GARTAPERTUREBASE,
488 &apbase);
476 489
477 /* if x86-64 aperture base is beyond 4G, exit here */ 490 /* if x86-64 aperture base is beyond 4G, exit here */
478 if ( (apbase & 0x7fff) >> (32 - 25) ) { 491 if ( (apbase & 0x7fff) >> (32 - 25) ) {
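
This hunk converts the AGP driver from the old k8_northbridges[] array and num_k8_northbridges counter to a single k8_northbridges object carrying the count, the per-node misc devices and a gart_supported flag, and every GART path now bails out early when GART is not present. The structure itself is declared in asm/amd_nb.h and is not reproduced in this excerpt, so the sketch below mocks it up purely to illustrate the new guard-then-iterate calling pattern; everything other than the num, nb_misc and gart_supported names is an assumption.

#include <stdbool.h>
#include <stdio.h>

/* Mock of the accessor shape used above; the real type lives in asm/amd_nb.h. */
struct nb_info {
        unsigned int num;         /* number of northbridges found     */
        bool gart_supported;      /* false on GART-less platforms     */
        void **nb_misc;           /* per-node misc PCI devices        */
};

static int dev0, dev1;                    /* stand-ins for struct pci_dev */
static void *fake_devs[] = { &dev0, &dev1 };
static struct nb_info k8_northbridges = {
        .num = 2, .gart_supported = true, .nb_misc = fake_devs,
};

/* Callers now test gart_supported before touching any GART register. */
static int configure_all_garts(void)
{
        unsigned int i;

        if (!k8_northbridges.gart_supported)
                return 0;

        for (i = 0; i < k8_northbridges.num; i++)
                printf("configure GART on northbridge %u (%p)\n",
                       i, k8_northbridges.nb_misc[i]);
        return 0;
}

int main(void)
{
        return configure_all_garts();
}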
diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c
index d2abf5143983..64255cef8a7d 100644
--- a/drivers/char/agp/generic.c
+++ b/drivers/char/agp/generic.c
@@ -984,7 +984,9 @@ int agp_generic_create_gatt_table(struct agp_bridge_data *bridge)
984 984
985 bridge->driver->cache_flush(); 985 bridge->driver->cache_flush();
986#ifdef CONFIG_X86 986#ifdef CONFIG_X86
987 set_memory_uc((unsigned long)table, 1 << page_order); 987 if (set_memory_uc((unsigned long)table, 1 << page_order))
988 printk(KERN_WARNING "Could not set GATT table memory to UC!");
989
988 bridge->gatt_table = (void *)table; 990 bridge->gatt_table = (void *)table;
989#else 991#else
990 bridge->gatt_table = ioremap_nocache(virt_to_phys(table), 992 bridge->gatt_table = ioremap_nocache(virt_to_phys(table),
diff --git a/drivers/char/tpm/tpm.c b/drivers/char/tpm/tpm.c
index 05ad4a17a28f..7c4133582dba 100644
--- a/drivers/char/tpm/tpm.c
+++ b/drivers/char/tpm/tpm.c
@@ -47,6 +47,16 @@ enum tpm_duration {
47#define TPM_MAX_PROTECTED_ORDINAL 12 47#define TPM_MAX_PROTECTED_ORDINAL 12
48#define TPM_PROTECTED_ORDINAL_MASK 0xFF 48#define TPM_PROTECTED_ORDINAL_MASK 0xFF
49 49
50/*
51 * Bug workaround - some TPM's don't flush the most
52 * recently changed pcr on suspend, so force the flush
53 * with an extend to the selected _unused_ non-volatile pcr.
54 */
55static int tpm_suspend_pcr;
56module_param_named(suspend_pcr, tpm_suspend_pcr, uint, 0644);
57MODULE_PARM_DESC(suspend_pcr,
 58		"PCR to use for dummy writes to facilitate flush on suspend.");
59
50static LIST_HEAD(tpm_chip_list); 60static LIST_HEAD(tpm_chip_list);
51static DEFINE_SPINLOCK(driver_lock); 61static DEFINE_SPINLOCK(driver_lock);
52static DECLARE_BITMAP(dev_mask, TPM_NUM_DEVICES); 62static DECLARE_BITMAP(dev_mask, TPM_NUM_DEVICES);
@@ -1077,18 +1087,6 @@ static struct tpm_input_header savestate_header = {
1077 .ordinal = TPM_ORD_SAVESTATE 1087 .ordinal = TPM_ORD_SAVESTATE
1078}; 1088};
1079 1089
1080/* Bug workaround - some TPM's don't flush the most
1081 * recently changed pcr on suspend, so force the flush
1082 * with an extend to the selected _unused_ non-volatile pcr.
1083 */
1084static int tpm_suspend_pcr;
1085static int __init tpm_suspend_setup(char *str)
1086{
1087 get_option(&str, &tpm_suspend_pcr);
1088 return 1;
1089}
1090__setup("tpm_suspend_pcr=", tpm_suspend_setup);
1091
1092/* 1090/*
1093 * We are about to suspend. Save the TPM state 1091 * We are about to suspend. Save the TPM state
1094 * so that it can be restored. 1092 * so that it can be restored.
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index 0f69c5ec0ecd..6c1b676643a9 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -48,6 +48,9 @@ struct ports_driver_data {
48 /* Used for exporting per-port information to debugfs */ 48 /* Used for exporting per-port information to debugfs */
49 struct dentry *debugfs_dir; 49 struct dentry *debugfs_dir;
50 50
51 /* List of all the devices we're handling */
52 struct list_head portdevs;
53
51 /* Number of devices this driver is handling */ 54 /* Number of devices this driver is handling */
52 unsigned int index; 55 unsigned int index;
53 56
@@ -108,6 +111,9 @@ struct port_buffer {
108 * ports for that device (vdev->priv). 111 * ports for that device (vdev->priv).
109 */ 112 */
110struct ports_device { 113struct ports_device {
114 /* Next portdev in the list, head is in the pdrvdata struct */
115 struct list_head list;
116
111 /* 117 /*
112 * Workqueue handlers where we process deferred work after 118 * Workqueue handlers where we process deferred work after
113 * notification 119 * notification
@@ -178,15 +184,21 @@ struct port {
178 struct console cons; 184 struct console cons;
179 185
180 /* Each port associates with a separate char device */ 186 /* Each port associates with a separate char device */
181 struct cdev cdev; 187 struct cdev *cdev;
182 struct device *dev; 188 struct device *dev;
183 189
190 /* Reference-counting to handle port hot-unplugs and file operations */
191 struct kref kref;
192
184 /* A waitqueue for poll() or blocking read operations */ 193 /* A waitqueue for poll() or blocking read operations */
185 wait_queue_head_t waitqueue; 194 wait_queue_head_t waitqueue;
186 195
187 /* The 'name' of the port that we expose via sysfs properties */ 196 /* The 'name' of the port that we expose via sysfs properties */
188 char *name; 197 char *name;
189 198
199 /* We can notify apps of host connect / disconnect events via SIGIO */
200 struct fasync_struct *async_queue;
201
190 /* The 'id' to identify the port with the Host */ 202 /* The 'id' to identify the port with the Host */
191 u32 id; 203 u32 id;
192 204
@@ -221,6 +233,41 @@ out:
221 return port; 233 return port;
222} 234}
223 235
236static struct port *find_port_by_devt_in_portdev(struct ports_device *portdev,
237 dev_t dev)
238{
239 struct port *port;
240 unsigned long flags;
241
242 spin_lock_irqsave(&portdev->ports_lock, flags);
243 list_for_each_entry(port, &portdev->ports, list)
244 if (port->cdev->dev == dev)
245 goto out;
246 port = NULL;
247out:
248 spin_unlock_irqrestore(&portdev->ports_lock, flags);
249
250 return port;
251}
252
253static struct port *find_port_by_devt(dev_t dev)
254{
255 struct ports_device *portdev;
256 struct port *port;
257 unsigned long flags;
258
259 spin_lock_irqsave(&pdrvdata_lock, flags);
260 list_for_each_entry(portdev, &pdrvdata.portdevs, list) {
261 port = find_port_by_devt_in_portdev(portdev, dev);
262 if (port)
263 goto out;
264 }
265 port = NULL;
266out:
267 spin_unlock_irqrestore(&pdrvdata_lock, flags);
268 return port;
269}
270
224static struct port *find_port_by_id(struct ports_device *portdev, u32 id) 271static struct port *find_port_by_id(struct ports_device *portdev, u32 id)
225{ 272{
226 struct port *port; 273 struct port *port;
@@ -410,7 +457,10 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id,
410static ssize_t send_control_msg(struct port *port, unsigned int event, 457static ssize_t send_control_msg(struct port *port, unsigned int event,
411 unsigned int value) 458 unsigned int value)
412{ 459{
413 return __send_control_msg(port->portdev, port->id, event, value); 460 /* Did the port get unplugged before userspace closed it? */
461 if (port->portdev)
462 return __send_control_msg(port->portdev, port->id, event, value);
463 return 0;
414} 464}
415 465
416/* Callers must take the port->outvq_lock */ 466/* Callers must take the port->outvq_lock */
@@ -525,6 +575,10 @@ static ssize_t fill_readbuf(struct port *port, char *out_buf, size_t out_count,
525/* The condition that must be true for polling to end */ 575/* The condition that must be true for polling to end */
526static bool will_read_block(struct port *port) 576static bool will_read_block(struct port *port)
527{ 577{
578 if (!port->guest_connected) {
579 /* Port got hot-unplugged. Let's exit. */
580 return false;
581 }
528 return !port_has_data(port) && port->host_connected; 582 return !port_has_data(port) && port->host_connected;
529} 583}
530 584
@@ -575,6 +629,9 @@ static ssize_t port_fops_read(struct file *filp, char __user *ubuf,
575 if (ret < 0) 629 if (ret < 0)
576 return ret; 630 return ret;
577 } 631 }
632 /* Port got hot-unplugged. */
633 if (!port->guest_connected)
634 return -ENODEV;
578 /* 635 /*
579 * We could've received a disconnection message while we were 636 * We could've received a disconnection message while we were
580 * waiting for more data. 637 * waiting for more data.
@@ -616,6 +673,9 @@ static ssize_t port_fops_write(struct file *filp, const char __user *ubuf,
616 if (ret < 0) 673 if (ret < 0)
617 return ret; 674 return ret;
618 } 675 }
676 /* Port got hot-unplugged. */
677 if (!port->guest_connected)
678 return -ENODEV;
619 679
620 count = min((size_t)(32 * 1024), count); 680 count = min((size_t)(32 * 1024), count);
621 681
@@ -656,6 +716,10 @@ static unsigned int port_fops_poll(struct file *filp, poll_table *wait)
656 port = filp->private_data; 716 port = filp->private_data;
657 poll_wait(filp, &port->waitqueue, wait); 717 poll_wait(filp, &port->waitqueue, wait);
658 718
719 if (!port->guest_connected) {
720 /* Port got unplugged */
721 return POLLHUP;
722 }
659 ret = 0; 723 ret = 0;
660 if (!will_read_block(port)) 724 if (!will_read_block(port))
661 ret |= POLLIN | POLLRDNORM; 725 ret |= POLLIN | POLLRDNORM;
@@ -667,6 +731,8 @@ static unsigned int port_fops_poll(struct file *filp, poll_table *wait)
667 return ret; 731 return ret;
668} 732}
669 733
734static void remove_port(struct kref *kref);
735
670static int port_fops_release(struct inode *inode, struct file *filp) 736static int port_fops_release(struct inode *inode, struct file *filp)
671{ 737{
672 struct port *port; 738 struct port *port;
@@ -687,6 +753,16 @@ static int port_fops_release(struct inode *inode, struct file *filp)
687 reclaim_consumed_buffers(port); 753 reclaim_consumed_buffers(port);
688 spin_unlock_irq(&port->outvq_lock); 754 spin_unlock_irq(&port->outvq_lock);
689 755
756 /*
757 * Locks aren't necessary here as a port can't be opened after
758 * unplug, and if a port isn't unplugged, a kref would already
759 * exist for the port. Plus, taking ports_lock here would
760 * create a dependency on other locks taken by functions
761 * inside remove_port if we're the last holder of the port,
762 * creating many problems.
763 */
764 kref_put(&port->kref, remove_port);
765
690 return 0; 766 return 0;
691} 767}
692 768
@@ -694,22 +770,31 @@ static int port_fops_open(struct inode *inode, struct file *filp)
694{ 770{
695 struct cdev *cdev = inode->i_cdev; 771 struct cdev *cdev = inode->i_cdev;
696 struct port *port; 772 struct port *port;
773 int ret;
697 774
698 port = container_of(cdev, struct port, cdev); 775 port = find_port_by_devt(cdev->dev);
699 filp->private_data = port; 776 filp->private_data = port;
700 777
 778	/* Guard against a port getting hot-unplugged at the same time */
779 spin_lock_irq(&port->portdev->ports_lock);
780 kref_get(&port->kref);
781 spin_unlock_irq(&port->portdev->ports_lock);
782
701 /* 783 /*
702 * Don't allow opening of console port devices -- that's done 784 * Don't allow opening of console port devices -- that's done
703 * via /dev/hvc 785 * via /dev/hvc
704 */ 786 */
705 if (is_console_port(port)) 787 if (is_console_port(port)) {
706 return -ENXIO; 788 ret = -ENXIO;
789 goto out;
790 }
707 791
708 /* Allow only one process to open a particular port at a time */ 792 /* Allow only one process to open a particular port at a time */
709 spin_lock_irq(&port->inbuf_lock); 793 spin_lock_irq(&port->inbuf_lock);
710 if (port->guest_connected) { 794 if (port->guest_connected) {
711 spin_unlock_irq(&port->inbuf_lock); 795 spin_unlock_irq(&port->inbuf_lock);
712 return -EMFILE; 796 ret = -EMFILE;
797 goto out;
713 } 798 }
714 799
715 port->guest_connected = true; 800 port->guest_connected = true;
@@ -724,10 +809,23 @@ static int port_fops_open(struct inode *inode, struct file *filp)
724 reclaim_consumed_buffers(port); 809 reclaim_consumed_buffers(port);
725 spin_unlock_irq(&port->outvq_lock); 810 spin_unlock_irq(&port->outvq_lock);
726 811
812 nonseekable_open(inode, filp);
813
727 /* Notify host of port being opened */ 814 /* Notify host of port being opened */
728 send_control_msg(filp->private_data, VIRTIO_CONSOLE_PORT_OPEN, 1); 815 send_control_msg(filp->private_data, VIRTIO_CONSOLE_PORT_OPEN, 1);
729 816
730 return 0; 817 return 0;
818out:
819 kref_put(&port->kref, remove_port);
820 return ret;
821}
822
823static int port_fops_fasync(int fd, struct file *filp, int mode)
824{
825 struct port *port;
826
827 port = filp->private_data;
828 return fasync_helper(fd, filp, mode, &port->async_queue);
731} 829}
732 830
733/* 831/*
@@ -743,6 +841,8 @@ static const struct file_operations port_fops = {
743 .write = port_fops_write, 841 .write = port_fops_write,
744 .poll = port_fops_poll, 842 .poll = port_fops_poll,
745 .release = port_fops_release, 843 .release = port_fops_release,
844 .fasync = port_fops_fasync,
845 .llseek = no_llseek,
746}; 846};
747 847
748/* 848/*
@@ -1001,6 +1101,12 @@ static unsigned int fill_queue(struct virtqueue *vq, spinlock_t *lock)
1001 return nr_added_bufs; 1101 return nr_added_bufs;
1002} 1102}
1003 1103
1104static void send_sigio_to_port(struct port *port)
1105{
1106 if (port->async_queue && port->guest_connected)
1107 kill_fasync(&port->async_queue, SIGIO, POLL_OUT);
1108}
1109
1004static int add_port(struct ports_device *portdev, u32 id) 1110static int add_port(struct ports_device *portdev, u32 id)
1005{ 1111{
1006 char debugfs_name[16]; 1112 char debugfs_name[16];
@@ -1015,6 +1121,7 @@ static int add_port(struct ports_device *portdev, u32 id)
1015 err = -ENOMEM; 1121 err = -ENOMEM;
1016 goto fail; 1122 goto fail;
1017 } 1123 }
1124 kref_init(&port->kref);
1018 1125
1019 port->portdev = portdev; 1126 port->portdev = portdev;
1020 port->id = id; 1127 port->id = id;
@@ -1022,6 +1129,7 @@ static int add_port(struct ports_device *portdev, u32 id)
1022 port->name = NULL; 1129 port->name = NULL;
1023 port->inbuf = NULL; 1130 port->inbuf = NULL;
1024 port->cons.hvc = NULL; 1131 port->cons.hvc = NULL;
1132 port->async_queue = NULL;
1025 1133
1026 port->cons.ws.ws_row = port->cons.ws.ws_col = 0; 1134 port->cons.ws.ws_row = port->cons.ws.ws_col = 0;
1027 1135
@@ -1032,14 +1140,20 @@ static int add_port(struct ports_device *portdev, u32 id)
1032 port->in_vq = portdev->in_vqs[port->id]; 1140 port->in_vq = portdev->in_vqs[port->id];
1033 port->out_vq = portdev->out_vqs[port->id]; 1141 port->out_vq = portdev->out_vqs[port->id];
1034 1142
1035 cdev_init(&port->cdev, &port_fops); 1143 port->cdev = cdev_alloc();
1144 if (!port->cdev) {
1145 dev_err(&port->portdev->vdev->dev, "Error allocating cdev\n");
1146 err = -ENOMEM;
1147 goto free_port;
1148 }
1149 port->cdev->ops = &port_fops;
1036 1150
1037 devt = MKDEV(portdev->chr_major, id); 1151 devt = MKDEV(portdev->chr_major, id);
1038 err = cdev_add(&port->cdev, devt, 1); 1152 err = cdev_add(port->cdev, devt, 1);
1039 if (err < 0) { 1153 if (err < 0) {
1040 dev_err(&port->portdev->vdev->dev, 1154 dev_err(&port->portdev->vdev->dev,
1041 "Error %d adding cdev for port %u\n", err, id); 1155 "Error %d adding cdev for port %u\n", err, id);
1042 goto free_port; 1156 goto free_cdev;
1043 } 1157 }
1044 port->dev = device_create(pdrvdata.class, &port->portdev->vdev->dev, 1158 port->dev = device_create(pdrvdata.class, &port->portdev->vdev->dev,
1045 devt, port, "vport%up%u", 1159 devt, port, "vport%up%u",
@@ -1104,7 +1218,7 @@ free_inbufs:
1104free_device: 1218free_device:
1105 device_destroy(pdrvdata.class, port->dev->devt); 1219 device_destroy(pdrvdata.class, port->dev->devt);
1106free_cdev: 1220free_cdev:
1107 cdev_del(&port->cdev); 1221 cdev_del(port->cdev);
1108free_port: 1222free_port:
1109 kfree(port); 1223 kfree(port);
1110fail: 1224fail:
@@ -1113,21 +1227,45 @@ fail:
1113 return err; 1227 return err;
1114} 1228}
1115 1229
1116/* Remove all port-specific data. */ 1230/* No users remain, remove all port-specific data. */
1117static int remove_port(struct port *port) 1231static void remove_port(struct kref *kref)
1232{
1233 struct port *port;
1234
1235 port = container_of(kref, struct port, kref);
1236
1237 sysfs_remove_group(&port->dev->kobj, &port_attribute_group);
1238 device_destroy(pdrvdata.class, port->dev->devt);
1239 cdev_del(port->cdev);
1240
1241 kfree(port->name);
1242
1243 debugfs_remove(port->debugfs_file);
1244
1245 kfree(port);
1246}
1247
1248/*
1249 * Port got unplugged. Remove port from portdev's list and drop the
1250 * kref reference. If no userspace has this port opened, it will
 1251 * result in immediate removal of the port.
1252 */
1253static void unplug_port(struct port *port)
1118{ 1254{
1119 struct port_buffer *buf; 1255 struct port_buffer *buf;
1120 1256
1257 spin_lock_irq(&port->portdev->ports_lock);
1258 list_del(&port->list);
1259 spin_unlock_irq(&port->portdev->ports_lock);
1260
1121 if (port->guest_connected) { 1261 if (port->guest_connected) {
1122 port->guest_connected = false; 1262 port->guest_connected = false;
1123 port->host_connected = false; 1263 port->host_connected = false;
1124 wake_up_interruptible(&port->waitqueue); 1264 wake_up_interruptible(&port->waitqueue);
1125 send_control_msg(port, VIRTIO_CONSOLE_PORT_OPEN, 0);
1126 }
1127 1265
1128 spin_lock_irq(&port->portdev->ports_lock); 1266 /* Let the app know the port is going down. */
1129 list_del(&port->list); 1267 send_sigio_to_port(port);
1130 spin_unlock_irq(&port->portdev->ports_lock); 1268 }
1131 1269
1132 if (is_console_port(port)) { 1270 if (is_console_port(port)) {
1133 spin_lock_irq(&pdrvdata_lock); 1271 spin_lock_irq(&pdrvdata_lock);
@@ -1146,9 +1284,6 @@ static int remove_port(struct port *port)
1146 hvc_remove(port->cons.hvc); 1284 hvc_remove(port->cons.hvc);
1147#endif 1285#endif
1148 } 1286 }
1149 sysfs_remove_group(&port->dev->kobj, &port_attribute_group);
1150 device_destroy(pdrvdata.class, port->dev->devt);
1151 cdev_del(&port->cdev);
1152 1287
1153 /* Remove unused data this port might have received. */ 1288 /* Remove unused data this port might have received. */
1154 discard_port_data(port); 1289 discard_port_data(port);
@@ -1159,12 +1294,19 @@ static int remove_port(struct port *port)
1159 while ((buf = virtqueue_detach_unused_buf(port->in_vq))) 1294 while ((buf = virtqueue_detach_unused_buf(port->in_vq)))
1160 free_buf(buf); 1295 free_buf(buf);
1161 1296
1162 kfree(port->name); 1297 /*
1163 1298 * We should just assume the device itself has gone off --
1164 debugfs_remove(port->debugfs_file); 1299 * else a close on an open port later will try to send out a
1300 * control message.
1301 */
1302 port->portdev = NULL;
1165 1303
1166 kfree(port); 1304 /*
1167 return 0; 1305 * Locks around here are not necessary - a port can't be
1306 * opened after we removed the port struct from ports_list
1307 * above.
1308 */
1309 kref_put(&port->kref, remove_port);
1168} 1310}
1169 1311
1170/* Any private messages that the Host and Guest want to share */ 1312/* Any private messages that the Host and Guest want to share */
@@ -1203,7 +1345,7 @@ static void handle_control_message(struct ports_device *portdev,
1203 add_port(portdev, cpkt->id); 1345 add_port(portdev, cpkt->id);
1204 break; 1346 break;
1205 case VIRTIO_CONSOLE_PORT_REMOVE: 1347 case VIRTIO_CONSOLE_PORT_REMOVE:
1206 remove_port(port); 1348 unplug_port(port);
1207 break; 1349 break;
1208 case VIRTIO_CONSOLE_CONSOLE_PORT: 1350 case VIRTIO_CONSOLE_CONSOLE_PORT:
1209 if (!cpkt->value) 1351 if (!cpkt->value)
@@ -1245,6 +1387,12 @@ static void handle_control_message(struct ports_device *portdev,
1245 spin_lock_irq(&port->outvq_lock); 1387 spin_lock_irq(&port->outvq_lock);
1246 reclaim_consumed_buffers(port); 1388 reclaim_consumed_buffers(port);
1247 spin_unlock_irq(&port->outvq_lock); 1389 spin_unlock_irq(&port->outvq_lock);
1390
1391 /*
1392 * If the guest is connected, it'll be interested in
1393 * knowing the host connection state changed.
1394 */
1395 send_sigio_to_port(port);
1248 break; 1396 break;
1249 case VIRTIO_CONSOLE_PORT_NAME: 1397 case VIRTIO_CONSOLE_PORT_NAME:
1250 /* 1398 /*
@@ -1341,6 +1489,9 @@ static void in_intr(struct virtqueue *vq)
1341 1489
1342 wake_up_interruptible(&port->waitqueue); 1490 wake_up_interruptible(&port->waitqueue);
1343 1491
1492 /* Send a SIGIO indicating new data in case the process asked for it */
1493 send_sigio_to_port(port);
1494
1344 if (is_console_port(port) && hvc_poll(port->cons.hvc)) 1495 if (is_console_port(port) && hvc_poll(port->cons.hvc))
1345 hvc_kick(); 1496 hvc_kick();
1346} 1497}
@@ -1577,6 +1728,10 @@ static int __devinit virtcons_probe(struct virtio_device *vdev)
1577 add_port(portdev, 0); 1728 add_port(portdev, 0);
1578 } 1729 }
1579 1730
1731 spin_lock_irq(&pdrvdata_lock);
1732 list_add_tail(&portdev->list, &pdrvdata.portdevs);
1733 spin_unlock_irq(&pdrvdata_lock);
1734
1580 __send_control_msg(portdev, VIRTIO_CONSOLE_BAD_ID, 1735 __send_control_msg(portdev, VIRTIO_CONSOLE_BAD_ID,
1581 VIRTIO_CONSOLE_DEVICE_READY, 1); 1736 VIRTIO_CONSOLE_DEVICE_READY, 1);
1582 return 0; 1737 return 0;
@@ -1600,23 +1755,41 @@ static void virtcons_remove(struct virtio_device *vdev)
1600{ 1755{
1601 struct ports_device *portdev; 1756 struct ports_device *portdev;
1602 struct port *port, *port2; 1757 struct port *port, *port2;
1603 struct port_buffer *buf;
1604 unsigned int len;
1605 1758
1606 portdev = vdev->priv; 1759 portdev = vdev->priv;
1607 1760
1761 spin_lock_irq(&pdrvdata_lock);
1762 list_del(&portdev->list);
1763 spin_unlock_irq(&pdrvdata_lock);
1764
1765 /* Disable interrupts for vqs */
1766 vdev->config->reset(vdev);
1767 /* Finish up work that's lined up */
1608 cancel_work_sync(&portdev->control_work); 1768 cancel_work_sync(&portdev->control_work);
1609 1769
1610 list_for_each_entry_safe(port, port2, &portdev->ports, list) 1770 list_for_each_entry_safe(port, port2, &portdev->ports, list)
1611 remove_port(port); 1771 unplug_port(port);
1612 1772
1613 unregister_chrdev(portdev->chr_major, "virtio-portsdev"); 1773 unregister_chrdev(portdev->chr_major, "virtio-portsdev");
1614 1774
1615 while ((buf = virtqueue_get_buf(portdev->c_ivq, &len))) 1775 /*
1616 free_buf(buf); 1776 * When yanking out a device, we immediately lose the
1777 * (device-side) queues. So there's no point in keeping the
1778 * guest side around till we drop our final reference. This
1779 * also means that any ports which are in an open state will
1780 * have to just stop using the port, as the vqs are going
1781 * away.
1782 */
1783 if (use_multiport(portdev)) {
1784 struct port_buffer *buf;
1785 unsigned int len;
1617 1786
1618 while ((buf = virtqueue_detach_unused_buf(portdev->c_ivq))) 1787 while ((buf = virtqueue_get_buf(portdev->c_ivq, &len)))
1619 free_buf(buf); 1788 free_buf(buf);
1789
1790 while ((buf = virtqueue_detach_unused_buf(portdev->c_ivq)))
1791 free_buf(buf);
1792 }
1620 1793
1621 vdev->config->del_vqs(vdev); 1794 vdev->config->del_vqs(vdev);
1622 kfree(portdev->in_vqs); 1795 kfree(portdev->in_vqs);
@@ -1663,6 +1836,7 @@ static int __init init(void)
1663 PTR_ERR(pdrvdata.debugfs_dir)); 1836 PTR_ERR(pdrvdata.debugfs_dir));
1664 } 1837 }
1665 INIT_LIST_HEAD(&pdrvdata.consoles); 1838 INIT_LIST_HEAD(&pdrvdata.consoles);
1839 INIT_LIST_HEAD(&pdrvdata.portdevs);
1666 1840
1667 return register_virtio_driver(&virtio_console); 1841 return register_virtio_driver(&virtio_console);
1668} 1842}
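
The virtio_console changes above move port lifetime onto reference counting: add_port() initialises a kref whose reference is owned by the portdev's port list, port_fops_open() takes an extra reference under ports_lock, port_fops_release() and unplug_port() each drop one, and whichever put turns out to be last runs remove_port(). A stand-alone user-space analogue of that scheme, using C11 atomics in place of the kernel kref API, is sketched below; the structure and helper names are illustrative only.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Analogue of the kref scheme: the port list owns one reference, every open
 * file owns one, and whoever drops the last reference frees the object. */
struct port {
        atomic_int refs;
        int id;
};

static struct port *port_alloc(int id)
{
        struct port *p = malloc(sizeof(*p));

        if (!p)
                return NULL;
        atomic_init(&p->refs, 1);       /* reference held by the port list */
        p->id = id;
        return p;
}

static void port_get(struct port *p)    /* like kref_get() in open()       */
{
        atomic_fetch_add(&p->refs, 1);
}

static void port_put(struct port *p)    /* like kref_put(&kref, remove_port) */
{
        if (atomic_fetch_sub(&p->refs, 1) == 1) {
                printf("last reference gone, freeing port %d\n", p->id);
                free(p);
        }
}

int main(void)
{
        struct port *p = port_alloc(0);

        if (!p)
                return EXIT_FAILURE;
        port_get(p);    /* userspace opens the port                        */
        port_put(p);    /* hot-unplug: the list drops its reference        */
        port_put(p);    /* release(): final put frees the port             */
        return EXIT_SUCCESS;
}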
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 70bb350de996..9dbb28b9559f 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -39,7 +39,7 @@ config EDAC_DEBUG
39 there're four debug levels (x=0,1,2,3 from low to high). 39 there're four debug levels (x=0,1,2,3 from low to high).
40 Usually you should select 'N'. 40 Usually you should select 'N'.
41 41
42 config EDAC_DECODE_MCE 42config EDAC_DECODE_MCE
43 tristate "Decode MCEs in human-readable form (only on AMD for now)" 43 tristate "Decode MCEs in human-readable form (only on AMD for now)"
44 depends on CPU_SUP_AMD && X86_MCE 44 depends on CPU_SUP_AMD && X86_MCE
45 default y 45 default y
@@ -51,6 +51,16 @@ config EDAC_DEBUG
51 which occur really early upon boot, before the module infrastructure 51 which occur really early upon boot, before the module infrastructure
52 has been initialized. 52 has been initialized.
53 53
54config EDAC_MCE_INJ
55 tristate "Simple MCE injection interface over /sysfs"
56 depends on EDAC_DECODE_MCE
57 default n
58 help
59 This is a simple interface to inject MCEs over /sysfs and test
60 the MCE decoding code in EDAC.
61
62 This is currently AMD-only.
63
54config EDAC_MM_EDAC 64config EDAC_MM_EDAC
55 tristate "Main Memory EDAC (Error Detection And Correction) reporting" 65 tristate "Main Memory EDAC (Error Detection And Correction) reporting"
56 help 66 help
@@ -66,13 +76,13 @@ config EDAC_MCE
66 76
67config EDAC_AMD64 77config EDAC_AMD64
68 tristate "AMD64 (Opteron, Athlon64) K8, F10h, F11h" 78 tristate "AMD64 (Opteron, Athlon64) K8, F10h, F11h"
69 depends on EDAC_MM_EDAC && K8_NB && X86_64 && PCI && EDAC_DECODE_MCE 79 depends on EDAC_MM_EDAC && AMD_NB && X86_64 && PCI && EDAC_DECODE_MCE
70 help 80 help
71 Support for error detection and correction on the AMD 64 81 Support for error detection and correction on the AMD 64
72 Families of Memory Controllers (K8, F10h and F11h) 82 Families of Memory Controllers (K8, F10h and F11h)
73 83
74config EDAC_AMD64_ERROR_INJECTION 84config EDAC_AMD64_ERROR_INJECTION
75 bool "Sysfs Error Injection facilities" 85 bool "Sysfs HW Error injection facilities"
76 depends on EDAC_AMD64 86 depends on EDAC_AMD64
77 help 87 help
78 Recent Opterons (Family 10h and later) provide for Memory Error 88 Recent Opterons (Family 10h and later) provide for Memory Error
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index ca6b1bb24ccc..32c7bc93c525 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -17,6 +17,9 @@ ifdef CONFIG_PCI
17edac_core-objs += edac_pci.o edac_pci_sysfs.o 17edac_core-objs += edac_pci.o edac_pci_sysfs.o
18endif 18endif
19 19
20obj-$(CONFIG_EDAC_MCE_INJ) += mce_amd_inj.o
21
22edac_mce_amd-objs := mce_amd.o
20obj-$(CONFIG_EDAC_DECODE_MCE) += edac_mce_amd.o 23obj-$(CONFIG_EDAC_DECODE_MCE) += edac_mce_amd.o
21 24
22obj-$(CONFIG_EDAC_AMD76X) += amd76x_edac.o 25obj-$(CONFIG_EDAC_AMD76X) += amd76x_edac.o
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index e7d5d6b5dcf6..8521401bbd75 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1,5 +1,5 @@
1#include "amd64_edac.h" 1#include "amd64_edac.h"
2#include <asm/k8.h> 2#include <asm/amd_nb.h>
3 3
4static struct edac_pci_ctl_info *amd64_ctl_pci; 4static struct edac_pci_ctl_info *amd64_ctl_pci;
5 5
@@ -2073,11 +2073,18 @@ static inline void __amd64_decode_bus_error(struct mem_ctl_info *mci,
2073 amd64_handle_ue(mci, info); 2073 amd64_handle_ue(mci, info);
2074} 2074}
2075 2075
2076void amd64_decode_bus_error(int node_id, struct err_regs *regs) 2076void amd64_decode_bus_error(int node_id, struct mce *m, u32 nbcfg)
2077{ 2077{
2078 struct mem_ctl_info *mci = mci_lookup[node_id]; 2078 struct mem_ctl_info *mci = mci_lookup[node_id];
2079 struct err_regs regs;
2079 2080
2080 __amd64_decode_bus_error(mci, regs); 2081 regs.nbsl = (u32) m->status;
2082 regs.nbsh = (u32)(m->status >> 32);
2083 regs.nbeal = (u32) m->addr;
2084 regs.nbeah = (u32)(m->addr >> 32);
2085 regs.nbcfg = nbcfg;
2086
2087 __amd64_decode_bus_error(mci, &regs);
2081 2088
2082 /* 2089 /*
2083 * Check the UE bit of the NB status high register, if set generate some 2090 * Check the UE bit of the NB status high register, if set generate some
@@ -2086,7 +2093,7 @@ void amd64_decode_bus_error(int node_id, struct err_regs *regs)
2086 * 2093 *
2087 * FIXME: this should go somewhere else, if at all. 2094 * FIXME: this should go somewhere else, if at all.
2088 */ 2095 */
2089 if (regs->nbsh & K8_NBSH_UC_ERR && !report_gart_errors) 2096 if (regs.nbsh & K8_NBSH_UC_ERR && !report_gart_errors)
2090 edac_mc_handle_ue_no_info(mci, "UE bit is set"); 2097 edac_mc_handle_ue_no_info(mci, "UE bit is set");
2091 2098
2092} 2099}
@@ -2927,7 +2934,7 @@ static int __init amd64_edac_init(void)
2927 * to finish initialization of the MC instances. 2934 * to finish initialization of the MC instances.
2928 */ 2935 */
2929 err = -ENODEV; 2936 err = -ENODEV;
2930 for (nb = 0; nb < num_k8_northbridges; nb++) { 2937 for (nb = 0; nb < k8_northbridges.num; nb++) {
2931 if (!pvt_lookup[nb]) 2938 if (!pvt_lookup[nb])
2932 continue; 2939 continue;
2933 2940
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index 613b9381e71a..044aee4f944d 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -72,7 +72,7 @@
72#include <linux/edac.h> 72#include <linux/edac.h>
73#include <asm/msr.h> 73#include <asm/msr.h>
74#include "edac_core.h" 74#include "edac_core.h"
75#include "edac_mce_amd.h" 75#include "mce_amd.h"
76 76
77#define amd64_printk(level, fmt, arg...) \ 77#define amd64_printk(level, fmt, arg...) \
78 edac_printk(level, "amd64", fmt, ##arg) 78 edac_printk(level, "amd64", fmt, ##arg)
@@ -482,11 +482,10 @@ extern const char *rrrr_msgs[16];
482extern const char *to_msgs[2]; 482extern const char *to_msgs[2];
483extern const char *pp_msgs[4]; 483extern const char *pp_msgs[4];
484extern const char *ii_msgs[4]; 484extern const char *ii_msgs[4];
485extern const char *ext_msgs[32];
486extern const char *htlink_msgs[8]; 485extern const char *htlink_msgs[8];
487 486
488#ifdef CONFIG_EDAC_DEBUG 487#ifdef CONFIG_EDAC_DEBUG
489#define NUM_DBG_ATTRS 9 488#define NUM_DBG_ATTRS 5
490#else 489#else
491#define NUM_DBG_ATTRS 0 490#define NUM_DBG_ATTRS 0
492#endif 491#endif
diff --git a/drivers/edac/amd64_edac_dbg.c b/drivers/edac/amd64_edac_dbg.c
index 59cf2cf6e11e..e3562288f4ce 100644
--- a/drivers/edac/amd64_edac_dbg.c
+++ b/drivers/edac/amd64_edac_dbg.c
@@ -1,167 +1,16 @@
1#include "amd64_edac.h" 1#include "amd64_edac.h"
2 2
3/* 3#define EDAC_DCT_ATTR_SHOW(reg) \
4 * accept a hex value and store it into the virtual error register file, field: 4static ssize_t amd64_##reg##_show(struct mem_ctl_info *mci, char *data) \
5 * nbeal and nbeah. Assume virtual error values have already been set for: NBSL, 5{ \
6 * NBSH and NBCFG. Then proceed to map the error values to a MC, CSROW and 6 struct amd64_pvt *pvt = mci->pvt_info; \
7 * CHANNEL 7 return sprintf(data, "0x%016llx\n", (u64)pvt->reg); \
8 */
9static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data,
10 size_t count)
11{
12 struct amd64_pvt *pvt = mci->pvt_info;
13 unsigned long long value;
14 int ret = 0;
15
16 ret = strict_strtoull(data, 16, &value);
17 if (ret != -EINVAL) {
18 debugf0("received NBEA= 0x%llx\n", value);
19
20 /* place the value into the virtual error packet */
21 pvt->ctl_error_info.nbeal = (u32) value;
22 value >>= 32;
23 pvt->ctl_error_info.nbeah = (u32) value;
24
25 /* Process the Mapping request */
26 /* TODO: Add race prevention */
27 amd_decode_nb_mce(pvt->mc_node_id, &pvt->ctl_error_info, 1);
28
29 return count;
30 }
31 return ret;
32}
33
34/* display back what the last NBEA (MCA NB Address (MC4_ADDR)) was written */
35static ssize_t amd64_nbea_show(struct mem_ctl_info *mci, char *data)
36{
37 struct amd64_pvt *pvt = mci->pvt_info;
38 u64 value;
39
40 value = pvt->ctl_error_info.nbeah;
41 value <<= 32;
42 value |= pvt->ctl_error_info.nbeal;
43
44 return sprintf(data, "%llx\n", value);
45}
46
47/* store the NBSL (MCA NB Status Low (MC4_STATUS)) value user desires */
48static ssize_t amd64_nbsl_store(struct mem_ctl_info *mci, const char *data,
49 size_t count)
50{
51 struct amd64_pvt *pvt = mci->pvt_info;
52 unsigned long value;
53 int ret = 0;
54
55 ret = strict_strtoul(data, 16, &value);
56 if (ret != -EINVAL) {
57 debugf0("received NBSL= 0x%lx\n", value);
58
59 pvt->ctl_error_info.nbsl = (u32) value;
60
61 return count;
62 }
63 return ret;
64}
65
66/* display back what the last NBSL value written */
67static ssize_t amd64_nbsl_show(struct mem_ctl_info *mci, char *data)
68{
69 struct amd64_pvt *pvt = mci->pvt_info;
70 u32 value;
71
72 value = pvt->ctl_error_info.nbsl;
73
74 return sprintf(data, "%x\n", value);
75}
76
77/* store the NBSH (MCA NB Status High) value user desires */
78static ssize_t amd64_nbsh_store(struct mem_ctl_info *mci, const char *data,
79 size_t count)
80{
81 struct amd64_pvt *pvt = mci->pvt_info;
82 unsigned long value;
83 int ret = 0;
84
85 ret = strict_strtoul(data, 16, &value);
86 if (ret != -EINVAL) {
87 debugf0("received NBSH= 0x%lx\n", value);
88
89 pvt->ctl_error_info.nbsh = (u32) value;
90
91 return count;
92 }
93 return ret;
94}
95
96/* display back what the last NBSH value written */
97static ssize_t amd64_nbsh_show(struct mem_ctl_info *mci, char *data)
98{
99 struct amd64_pvt *pvt = mci->pvt_info;
100 u32 value;
101
102 value = pvt->ctl_error_info.nbsh;
103
104 return sprintf(data, "%x\n", value);
105} 8}
106 9
107/* accept and store the NBCFG (MCA NB Configuration) value user desires */ 10EDAC_DCT_ATTR_SHOW(dhar);
108static ssize_t amd64_nbcfg_store(struct mem_ctl_info *mci, 11EDAC_DCT_ATTR_SHOW(dbam0);
109 const char *data, size_t count) 12EDAC_DCT_ATTR_SHOW(top_mem);
110{ 13EDAC_DCT_ATTR_SHOW(top_mem2);
111 struct amd64_pvt *pvt = mci->pvt_info;
112 unsigned long value;
113 int ret = 0;
114
115 ret = strict_strtoul(data, 16, &value);
116 if (ret != -EINVAL) {
117 debugf0("received NBCFG= 0x%lx\n", value);
118
119 pvt->ctl_error_info.nbcfg = (u32) value;
120
121 return count;
122 }
123 return ret;
124}
125
126/* various show routines for the controls of a MCI */
127static ssize_t amd64_nbcfg_show(struct mem_ctl_info *mci, char *data)
128{
129 struct amd64_pvt *pvt = mci->pvt_info;
130
131 return sprintf(data, "%x\n", pvt->ctl_error_info.nbcfg);
132}
133
134
135static ssize_t amd64_dhar_show(struct mem_ctl_info *mci, char *data)
136{
137 struct amd64_pvt *pvt = mci->pvt_info;
138
139 return sprintf(data, "%x\n", pvt->dhar);
140}
141
142
143static ssize_t amd64_dbam_show(struct mem_ctl_info *mci, char *data)
144{
145 struct amd64_pvt *pvt = mci->pvt_info;
146
147 return sprintf(data, "%x\n", pvt->dbam0);
148}
149
150
151static ssize_t amd64_topmem_show(struct mem_ctl_info *mci, char *data)
152{
153 struct amd64_pvt *pvt = mci->pvt_info;
154
155 return sprintf(data, "%llx\n", pvt->top_mem);
156}
157
158
159static ssize_t amd64_topmem2_show(struct mem_ctl_info *mci, char *data)
160{
161 struct amd64_pvt *pvt = mci->pvt_info;
162
163 return sprintf(data, "%llx\n", pvt->top_mem2);
164}
165 14
166static ssize_t amd64_hole_show(struct mem_ctl_info *mci, char *data) 15static ssize_t amd64_hole_show(struct mem_ctl_info *mci, char *data)
167{ 16{
@@ -182,38 +31,6 @@ struct mcidev_sysfs_attribute amd64_dbg_attrs[] = {
182 31
183 { 32 {
184 .attr = { 33 .attr = {
185 .name = "nbea_ctl",
186 .mode = (S_IRUGO | S_IWUSR)
187 },
188 .show = amd64_nbea_show,
189 .store = amd64_nbea_store,
190 },
191 {
192 .attr = {
193 .name = "nbsl_ctl",
194 .mode = (S_IRUGO | S_IWUSR)
195 },
196 .show = amd64_nbsl_show,
197 .store = amd64_nbsl_store,
198 },
199 {
200 .attr = {
201 .name = "nbsh_ctl",
202 .mode = (S_IRUGO | S_IWUSR)
203 },
204 .show = amd64_nbsh_show,
205 .store = amd64_nbsh_store,
206 },
207 {
208 .attr = {
209 .name = "nbcfg_ctl",
210 .mode = (S_IRUGO | S_IWUSR)
211 },
212 .show = amd64_nbcfg_show,
213 .store = amd64_nbcfg_store,
214 },
215 {
216 .attr = {
217 .name = "dhar", 34 .name = "dhar",
218 .mode = (S_IRUGO) 35 .mode = (S_IRUGO)
219 }, 36 },
@@ -225,7 +42,7 @@ struct mcidev_sysfs_attribute amd64_dbg_attrs[] = {
225 .name = "dbam", 42 .name = "dbam",
226 .mode = (S_IRUGO) 43 .mode = (S_IRUGO)
227 }, 44 },
228 .show = amd64_dbam_show, 45 .show = amd64_dbam0_show,
229 .store = NULL, 46 .store = NULL,
230 }, 47 },
231 { 48 {
@@ -233,7 +50,7 @@ struct mcidev_sysfs_attribute amd64_dbg_attrs[] = {
233 .name = "topmem", 50 .name = "topmem",
234 .mode = (S_IRUGO) 51 .mode = (S_IRUGO)
235 }, 52 },
236 .show = amd64_topmem_show, 53 .show = amd64_top_mem_show,
237 .store = NULL, 54 .store = NULL,
238 }, 55 },
239 { 56 {
@@ -241,7 +58,7 @@ struct mcidev_sysfs_attribute amd64_dbg_attrs[] = {
241 .name = "topmem2", 58 .name = "topmem2",
242 .mode = (S_IRUGO) 59 .mode = (S_IRUGO)
243 }, 60 },
244 .show = amd64_topmem2_show, 61 .show = amd64_top_mem2_show,
245 .store = NULL, 62 .store = NULL,
246 }, 63 },
247 { 64 {
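
The EDAC_DCT_ATTR_SHOW() generator introduced at the top of this file replaces the hand-written dhar/dbam/topmem/topmem2 show routines that the hunk deletes. For one register, EDAC_DCT_ATTR_SHOW(dhar) expands to roughly the following (whitespace added for readability; the field comes from struct amd64_pvt as before):

static ssize_t amd64_dhar_show(struct mem_ctl_info *mci, char *data)
{
        struct amd64_pvt *pvt = mci->pvt_info;

        return sprintf(data, "0x%016llx\n", (u64)pvt->dhar);
}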
diff --git a/drivers/edac/edac_device_sysfs.c b/drivers/edac/edac_device_sysfs.c
index 070968178a24..2941dca91aae 100644
--- a/drivers/edac/edac_device_sysfs.c
+++ b/drivers/edac/edac_device_sysfs.c
@@ -13,6 +13,7 @@
13#include <linux/ctype.h> 13#include <linux/ctype.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/edac.h>
16 17
17#include "edac_core.h" 18#include "edac_core.h"
18#include "edac_module.h" 19#include "edac_module.h"
@@ -235,7 +236,7 @@ int edac_device_register_sysfs_main_kobj(struct edac_device_ctl_info *edac_dev)
235 debugf1("%s()\n", __func__); 236 debugf1("%s()\n", __func__);
236 237
237 /* get the /sys/devices/system/edac reference */ 238 /* get the /sys/devices/system/edac reference */
238 edac_class = edac_get_edac_class(); 239 edac_class = edac_get_sysfs_class();
239 if (edac_class == NULL) { 240 if (edac_class == NULL) {
240 debugf1("%s() no edac_class error\n", __func__); 241 debugf1("%s() no edac_class error\n", __func__);
241 err = -ENODEV; 242 err = -ENODEV;
@@ -255,7 +256,7 @@ int edac_device_register_sysfs_main_kobj(struct edac_device_ctl_info *edac_dev)
255 256
256 if (!try_module_get(edac_dev->owner)) { 257 if (!try_module_get(edac_dev->owner)) {
257 err = -ENODEV; 258 err = -ENODEV;
258 goto err_out; 259 goto err_mod_get;
259 } 260 }
260 261
261 /* register */ 262 /* register */
@@ -282,6 +283,9 @@ int edac_device_register_sysfs_main_kobj(struct edac_device_ctl_info *edac_dev)
282err_kobj_reg: 283err_kobj_reg:
283 module_put(edac_dev->owner); 284 module_put(edac_dev->owner);
284 285
286err_mod_get:
287 edac_put_sysfs_class();
288
285err_out: 289err_out:
286 return err; 290 return err;
287} 291}
@@ -290,12 +294,11 @@ err_out:
290 * edac_device_unregister_sysfs_main_kobj: 294 * edac_device_unregister_sysfs_main_kobj:
291 * the '..../edac/<name>' kobject 295 * the '..../edac/<name>' kobject
292 */ 296 */
293void edac_device_unregister_sysfs_main_kobj( 297void edac_device_unregister_sysfs_main_kobj(struct edac_device_ctl_info *dev)
294 struct edac_device_ctl_info *edac_dev)
295{ 298{
296 debugf0("%s()\n", __func__); 299 debugf0("%s()\n", __func__);
297 debugf4("%s() name of kobject is: %s\n", 300 debugf4("%s() name of kobject is: %s\n",
298 __func__, kobject_name(&edac_dev->kobj)); 301 __func__, kobject_name(&dev->kobj));
299 302
300 /* 303 /*
301 * Unregister the edac device's kobject and 304 * Unregister the edac device's kobject and
@@ -304,7 +307,8 @@ void edac_device_unregister_sysfs_main_kobj(
304 * a) module_put() this module 307 * a) module_put() this module
305 * b) 'kfree' the memory 308 * b) 'kfree' the memory
306 */ 309 */
307 kobject_put(&edac_dev->kobj); 310 kobject_put(&dev->kobj);
311 edac_put_sysfs_class();
308} 312}
309 313
310/* edac_dev -> instance information */ 314/* edac_dev -> instance information */
diff --git a/drivers/edac/edac_mc_sysfs.c b/drivers/edac/edac_mc_sysfs.c
index 8aad94d10c0c..a4135860149b 100644
--- a/drivers/edac/edac_mc_sysfs.c
+++ b/drivers/edac/edac_mc_sysfs.c
@@ -11,6 +11,7 @@
11 11
12#include <linux/ctype.h> 12#include <linux/ctype.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/edac.h>
14#include <linux/bug.h> 15#include <linux/bug.h>
15 16
16#include "edac_core.h" 17#include "edac_core.h"
@@ -1011,13 +1012,13 @@ void edac_remove_sysfs_mci_device(struct mem_ctl_info *mci)
1011 */ 1012 */
1012int edac_sysfs_setup_mc_kset(void) 1013int edac_sysfs_setup_mc_kset(void)
1013{ 1014{
1014 int err = 0; 1015 int err = -EINVAL;
1015 struct sysdev_class *edac_class; 1016 struct sysdev_class *edac_class;
1016 1017
1017 debugf1("%s()\n", __func__); 1018 debugf1("%s()\n", __func__);
1018 1019
1019 /* get the /sys/devices/system/edac class reference */ 1020 /* get the /sys/devices/system/edac class reference */
1020 edac_class = edac_get_edac_class(); 1021 edac_class = edac_get_sysfs_class();
1021 if (edac_class == NULL) { 1022 if (edac_class == NULL) {
1022 debugf1("%s() no edac_class error=%d\n", __func__, err); 1023 debugf1("%s() no edac_class error=%d\n", __func__, err);
1023 goto fail_out; 1024 goto fail_out;
@@ -1028,15 +1029,16 @@ int edac_sysfs_setup_mc_kset(void)
1028 if (!mc_kset) { 1029 if (!mc_kset) {
1029 err = -ENOMEM; 1030 err = -ENOMEM;
1030 debugf1("%s() Failed to register '.../edac/mc'\n", __func__); 1031 debugf1("%s() Failed to register '.../edac/mc'\n", __func__);
1031 goto fail_out; 1032 goto fail_kset;
1032 } 1033 }
1033 1034
1034 debugf1("%s() Registered '.../edac/mc' kobject\n", __func__); 1035 debugf1("%s() Registered '.../edac/mc' kobject\n", __func__);
1035 1036
1036 return 0; 1037 return 0;
1037 1038
1039fail_kset:
1040 edac_put_sysfs_class();
1038 1041
1039 /* error unwind stack */
1040fail_out: 1042fail_out:
1041 return err; 1043 return err;
1042} 1044}
@@ -1049,5 +1051,6 @@ fail_out:
1049void edac_sysfs_teardown_mc_kset(void) 1051void edac_sysfs_teardown_mc_kset(void)
1050{ 1052{
1051 kset_unregister(mc_kset); 1053 kset_unregister(mc_kset);
1054 edac_put_sysfs_class();
1052} 1055}
1053 1056
diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c
deleted file mode 100644
index 9014df6f605d..000000000000
--- a/drivers/edac/edac_mce_amd.c
+++ /dev/null
@@ -1,452 +0,0 @@
1#include <linux/module.h>
2#include "edac_mce_amd.h"
3
4static bool report_gart_errors;
5static void (*nb_bus_decoder)(int node_id, struct err_regs *regs);
6
7void amd_report_gart_errors(bool v)
8{
9 report_gart_errors = v;
10}
11EXPORT_SYMBOL_GPL(amd_report_gart_errors);
12
13void amd_register_ecc_decoder(void (*f)(int, struct err_regs *))
14{
15 nb_bus_decoder = f;
16}
17EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
18
19void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *))
20{
21 if (nb_bus_decoder) {
22 WARN_ON(nb_bus_decoder != f);
23
24 nb_bus_decoder = NULL;
25 }
26}
27EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
28
29/*
30 * string representation for the different MCA reported error types, see F3x48
31 * or MSR0000_0411.
32 */
33const char *tt_msgs[] = { /* transaction type */
34 "instruction",
35 "data",
36 "generic",
37 "reserved"
38};
39EXPORT_SYMBOL_GPL(tt_msgs);
40
41const char *ll_msgs[] = { /* cache level */
42 "L0",
43 "L1",
44 "L2",
45 "L3/generic"
46};
47EXPORT_SYMBOL_GPL(ll_msgs);
48
49const char *rrrr_msgs[] = {
50 "generic",
51 "generic read",
52 "generic write",
53 "data read",
54 "data write",
55 "inst fetch",
56 "prefetch",
57 "evict",
58 "snoop",
59 "reserved RRRR= 9",
60 "reserved RRRR= 10",
61 "reserved RRRR= 11",
62 "reserved RRRR= 12",
63 "reserved RRRR= 13",
64 "reserved RRRR= 14",
65 "reserved RRRR= 15"
66};
67EXPORT_SYMBOL_GPL(rrrr_msgs);
68
69const char *pp_msgs[] = { /* participating processor */
70 "local node originated (SRC)",
71 "local node responded to request (RES)",
72 "local node observed as 3rd party (OBS)",
73 "generic"
74};
75EXPORT_SYMBOL_GPL(pp_msgs);
76
77const char *to_msgs[] = {
78 "no timeout",
79 "timed out"
80};
81EXPORT_SYMBOL_GPL(to_msgs);
82
83const char *ii_msgs[] = { /* memory or i/o */
84 "mem access",
85 "reserved",
86 "i/o access",
87 "generic"
88};
89EXPORT_SYMBOL_GPL(ii_msgs);
90
91/*
92 * Map the 4 or 5 (family-specific) bits of Extended Error code to the
93 * string table.
94 */
95const char *ext_msgs[] = {
96 "K8 ECC error", /* 0_0000b */
97 "CRC error on link", /* 0_0001b */
98 "Sync error packets on link", /* 0_0010b */
99 "Master Abort during link operation", /* 0_0011b */
100 "Target Abort during link operation", /* 0_0100b */
101 "Invalid GART PTE entry during table walk", /* 0_0101b */
102 "Unsupported atomic RMW command received", /* 0_0110b */
103 "WDT error: NB transaction timeout", /* 0_0111b */
104 "ECC/ChipKill ECC error", /* 0_1000b */
105 "SVM DEV Error", /* 0_1001b */
106 "Link Data error", /* 0_1010b */
107 "Link/L3/Probe Filter Protocol error", /* 0_1011b */
108 "NB Internal Arrays Parity error", /* 0_1100b */
109 "DRAM Address/Control Parity error", /* 0_1101b */
110 "Link Transmission error", /* 0_1110b */
111 "GART/DEV Table Walk Data error" /* 0_1111b */
112 "Res 0x100 error", /* 1_0000b */
113 "Res 0x101 error", /* 1_0001b */
114 "Res 0x102 error", /* 1_0010b */
115 "Res 0x103 error", /* 1_0011b */
116 "Res 0x104 error", /* 1_0100b */
117 "Res 0x105 error", /* 1_0101b */
118 "Res 0x106 error", /* 1_0110b */
119 "Res 0x107 error", /* 1_0111b */
120 "Res 0x108 error", /* 1_1000b */
121 "Res 0x109 error", /* 1_1001b */
122 "Res 0x10A error", /* 1_1010b */
123 "Res 0x10B error", /* 1_1011b */
124 "ECC error in L3 Cache Data", /* 1_1100b */
125 "L3 Cache Tag error", /* 1_1101b */
126 "L3 Cache LRU Parity error", /* 1_1110b */
127 "Probe Filter error" /* 1_1111b */
128};
129EXPORT_SYMBOL_GPL(ext_msgs);
130
131static void amd_decode_dc_mce(u64 mc0_status)
132{
133 u32 ec = mc0_status & 0xffff;
134 u32 xec = (mc0_status >> 16) & 0xf;
135
136 pr_emerg("Data Cache Error");
137
138 if (xec == 1 && TLB_ERROR(ec))
139 pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
140 else if (xec == 0) {
141 if (mc0_status & (1ULL << 40))
142 pr_cont(" during Data Scrub.\n");
143 else if (TLB_ERROR(ec))
144 pr_cont(": %s TLB parity error.\n", LL_MSG(ec));
145 else if (MEM_ERROR(ec)) {
146 u8 ll = ec & 0x3;
147 u8 tt = (ec >> 2) & 0x3;
148 u8 rrrr = (ec >> 4) & 0xf;
149
150 /* see F10h BKDG (31116), Table 92. */
151 if (ll == 0x1) {
152 if (tt != 0x1)
153 goto wrong_dc_mce;
154
155 pr_cont(": Data/Tag %s error.\n", RRRR_MSG(ec));
156
157 } else if (ll == 0x2 && rrrr == 0x3)
158 pr_cont(" during L1 linefill from L2.\n");
159 else
160 goto wrong_dc_mce;
161 } else if (BUS_ERROR(ec) && boot_cpu_data.x86 == 0xf)
162 pr_cont(" during system linefill.\n");
163 else
164 goto wrong_dc_mce;
165 } else
166 goto wrong_dc_mce;
167
168 return;
169
170wrong_dc_mce:
171 pr_warning("Corrupted DC MCE info?\n");
172}
173
174static void amd_decode_ic_mce(u64 mc1_status)
175{
176 u32 ec = mc1_status & 0xffff;
177 u32 xec = (mc1_status >> 16) & 0xf;
178
179 pr_emerg("Instruction Cache Error");
180
181 if (xec == 1 && TLB_ERROR(ec))
182 pr_cont(": %s TLB multimatch.\n", LL_MSG(ec));
183 else if (xec == 0) {
184 if (TLB_ERROR(ec))
185 pr_cont(": %s TLB Parity error.\n", LL_MSG(ec));
186 else if (BUS_ERROR(ec)) {
187 if (boot_cpu_data.x86 == 0xf &&
188 (mc1_status & (1ULL << 58)))
189 pr_cont(" during system linefill.\n");
190 else
191 pr_cont(" during attempted NB data read.\n");
192 } else if (MEM_ERROR(ec)) {
193 u8 ll = ec & 0x3;
194 u8 rrrr = (ec >> 4) & 0xf;
195
196 if (ll == 0x2)
197 pr_cont(" during a linefill from L2.\n");
198 else if (ll == 0x1) {
199
200 switch (rrrr) {
201 case 0x5:
202 pr_cont(": Parity error during "
203 "data load.\n");
204 break;
205
206 case 0x7:
207 pr_cont(": Copyback Parity/Victim"
208 " error.\n");
209 break;
210
211 case 0x8:
212 pr_cont(": Tag Snoop error.\n");
213 break;
214
215 default:
216 goto wrong_ic_mce;
217 break;
218 }
219 }
220 } else
221 goto wrong_ic_mce;
222 } else
223 goto wrong_ic_mce;
224
225 return;
226
227wrong_ic_mce:
228 pr_warning("Corrupted IC MCE info?\n");
229}
230
231static void amd_decode_bu_mce(u64 mc2_status)
232{
233 u32 ec = mc2_status & 0xffff;
234 u32 xec = (mc2_status >> 16) & 0xf;
235
236 pr_emerg("Bus Unit Error");
237
238 if (xec == 0x1)
239 pr_cont(" in the write data buffers.\n");
240 else if (xec == 0x3)
241 pr_cont(" in the victim data buffers.\n");
242 else if (xec == 0x2 && MEM_ERROR(ec))
243 pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec));
244 else if (xec == 0x0) {
245 if (TLB_ERROR(ec))
246 pr_cont(": %s error in a Page Descriptor Cache or "
247 "Guest TLB.\n", TT_MSG(ec));
248 else if (BUS_ERROR(ec))
249 pr_cont(": %s/ECC error in data read from NB: %s.\n",
250 RRRR_MSG(ec), PP_MSG(ec));
251 else if (MEM_ERROR(ec)) {
252 u8 rrrr = (ec >> 4) & 0xf;
253
254 if (rrrr >= 0x7)
255 pr_cont(": %s error during data copyback.\n",
256 RRRR_MSG(ec));
257 else if (rrrr <= 0x1)
258 pr_cont(": %s parity/ECC error during data "
259 "access from L2.\n", RRRR_MSG(ec));
260 else
261 goto wrong_bu_mce;
262 } else
263 goto wrong_bu_mce;
264 } else
265 goto wrong_bu_mce;
266
267 return;
268
269wrong_bu_mce:
270 pr_warning("Corrupted BU MCE info?\n");
271}
272
273static void amd_decode_ls_mce(u64 mc3_status)
274{
275 u32 ec = mc3_status & 0xffff;
276 u32 xec = (mc3_status >> 16) & 0xf;
277
278 pr_emerg("Load Store Error");
279
280 if (xec == 0x0) {
281 u8 rrrr = (ec >> 4) & 0xf;
282
283 if (!BUS_ERROR(ec) || (rrrr != 0x3 && rrrr != 0x4))
284 goto wrong_ls_mce;
285
286 pr_cont(" during %s.\n", RRRR_MSG(ec));
287 }
288 return;
289
290wrong_ls_mce:
291 pr_warning("Corrupted LS MCE info?\n");
292}
293
294void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors)
295{
296 u32 ec = ERROR_CODE(regs->nbsl);
297
298 if (!handle_errors)
299 return;
300
301 /*
302 * GART TLB error reporting is disabled by default. Bail out early.
303 */
304 if (TLB_ERROR(ec) && !report_gart_errors)
305 return;
306
307 pr_emerg("Northbridge Error, node %d", node_id);
308
309 /*
310 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
311 * value encoding has changed so interpret those differently
312 */
313 if ((boot_cpu_data.x86 == 0x10) &&
314 (boot_cpu_data.x86_model > 7)) {
315 if (regs->nbsh & K8_NBSH_ERR_CPU_VAL)
316 pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf));
317 } else {
318 u8 assoc_cpus = regs->nbsh & 0xf;
319
320 if (assoc_cpus > 0)
321 pr_cont(", core: %d", fls(assoc_cpus) - 1);
322
323 pr_cont("\n");
324 }
325
326 pr_emerg("%s.\n", EXT_ERR_MSG(regs->nbsl));
327
328 if (BUS_ERROR(ec) && nb_bus_decoder)
329 nb_bus_decoder(node_id, regs);
330}
331EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
332
333static void amd_decode_fr_mce(u64 mc5_status)
334{
335 /* we have only one error signature so match all fields at once. */
336 if ((mc5_status & 0xffff) == 0x0f0f)
337 pr_emerg(" FR Error: CPU Watchdog timer expire.\n");
338 else
339 pr_warning("Corrupted FR MCE info?\n");
340}
341
342static inline void amd_decode_err_code(unsigned int ec)
343{
344 if (TLB_ERROR(ec)) {
345 pr_emerg("Transaction: %s, Cache Level %s\n",
346 TT_MSG(ec), LL_MSG(ec));
347 } else if (MEM_ERROR(ec)) {
348 pr_emerg("Transaction: %s, Type: %s, Cache Level: %s",
349 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
350 } else if (BUS_ERROR(ec)) {
351 pr_emerg("Transaction type: %s(%s), %s, Cache Level: %s, "
352 "Participating Processor: %s\n",
353 RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
354 PP_MSG(ec));
355 } else
356 pr_warning("Huh? Unknown MCE error 0x%x\n", ec);
357}
358
359static int amd_decode_mce(struct notifier_block *nb, unsigned long val,
360 void *data)
361{
362 struct mce *m = (struct mce *)data;
363 struct err_regs regs;
364 int node, ecc;
365
366 pr_emerg("MC%d_STATUS: ", m->bank);
367
368 pr_cont("%sorrected error, other errors lost: %s, "
369 "CPU context corrupt: %s",
370 ((m->status & MCI_STATUS_UC) ? "Unc" : "C"),
371 ((m->status & MCI_STATUS_OVER) ? "yes" : "no"),
372 ((m->status & MCI_STATUS_PCC) ? "yes" : "no"));
373
374 /* do the two bits[14:13] together */
375 ecc = (m->status >> 45) & 0x3;
376 if (ecc)
377 pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
378
379 pr_cont("\n");
380
381 switch (m->bank) {
382 case 0:
383 amd_decode_dc_mce(m->status);
384 break;
385
386 case 1:
387 amd_decode_ic_mce(m->status);
388 break;
389
390 case 2:
391 amd_decode_bu_mce(m->status);
392 break;
393
394 case 3:
395 amd_decode_ls_mce(m->status);
396 break;
397
398 case 4:
399 regs.nbsl = (u32) m->status;
400 regs.nbsh = (u32)(m->status >> 32);
401 regs.nbeal = (u32) m->addr;
402 regs.nbeah = (u32)(m->addr >> 32);
403 node = amd_get_nb_id(m->extcpu);
404
405 amd_decode_nb_mce(node, &regs, 1);
406 break;
407
408 case 5:
409 amd_decode_fr_mce(m->status);
410 break;
411
412 default:
413 break;
414 }
415
416 amd_decode_err_code(m->status & 0xffff);
417
418 return NOTIFY_STOP;
419}
420
421static struct notifier_block amd_mce_dec_nb = {
422 .notifier_call = amd_decode_mce,
423};
424
425static int __init mce_amd_init(void)
426{
427 /*
428 * We can decode MCEs for K8, F10h and F11h CPUs:
429 */
430 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
431 return 0;
432
433 if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11)
434 return 0;
435
436 atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb);
437
438 return 0;
439}
440early_initcall(mce_amd_init);
441
442#ifdef MODULE
443static void __exit mce_amd_exit(void)
444{
445 atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb);
446}
447
448MODULE_DESCRIPTION("AMD MCE decoder");
449MODULE_ALIAS("edac-mce-amd");
450MODULE_LICENSE("GPL");
451module_exit(mce_amd_exit);
452#endif
diff --git a/drivers/edac/edac_module.c b/drivers/edac/edac_module.c
index 7e1374afd967..be4b075c3098 100644
--- a/drivers/edac/edac_module.c
+++ b/drivers/edac/edac_module.c
@@ -27,15 +27,6 @@ EXPORT_SYMBOL_GPL(edac_debug_level);
27struct workqueue_struct *edac_workqueue; 27struct workqueue_struct *edac_workqueue;
28 28
29/* 29/*
30 * sysfs object: /sys/devices/system/edac
31 * need to export to other files in this modules
32 */
33static struct sysdev_class edac_class = {
34 .name = "edac",
35};
36static int edac_class_valid;
37
38/*
39 * edac_op_state_to_string() 30 * edac_op_state_to_string()
40 */ 31 */
41char *edac_op_state_to_string(int opstate) 32char *edac_op_state_to_string(int opstate)
@@ -55,60 +46,6 @@ char *edac_op_state_to_string(int opstate)
55} 46}
56 47
57/* 48/*
58 * edac_get_edac_class()
59 *
60 * return pointer to the edac class of 'edac'
61 */
62struct sysdev_class *edac_get_edac_class(void)
63{
64 struct sysdev_class *classptr = NULL;
65
66 if (edac_class_valid)
67 classptr = &edac_class;
68
69 return classptr;
70}
71
72/*
73 * edac_register_sysfs_edac_name()
74 *
75 * register the 'edac' into /sys/devices/system
76 *
77 * return:
78 * 0 success
79 * !0 error
80 */
81static int edac_register_sysfs_edac_name(void)
82{
83 int err;
84
85 /* create the /sys/devices/system/edac directory */
86 err = sysdev_class_register(&edac_class);
87
88 if (err) {
89 debugf1("%s() error=%d\n", __func__, err);
90 return err;
91 }
92
93 edac_class_valid = 1;
94 return 0;
95}
96
97/*
98 * sysdev_class_unregister()
99 *
100 * unregister the 'edac' from /sys/devices/system
101 */
102static void edac_unregister_sysfs_edac_name(void)
103{
104 /* only if currently registered, then unregister it */
105 if (edac_class_valid)
106 sysdev_class_unregister(&edac_class);
107
108 edac_class_valid = 0;
109}
110
111/*
112 * edac_workqueue_setup 49 * edac_workqueue_setup
113 * initialize the edac work queue for polling operations 50 * initialize the edac work queue for polling operations
114 */ 51 */
@@ -154,21 +91,11 @@ static int __init edac_init(void)
154 edac_pci_clear_parity_errors(); 91 edac_pci_clear_parity_errors();
155 92
156 /* 93 /*
157 * perform the registration of the /sys/devices/system/edac class object
158 */
159 if (edac_register_sysfs_edac_name()) {
160 edac_printk(KERN_ERR, EDAC_MC,
161 "Error initializing 'edac' kobject\n");
162 err = -ENODEV;
163 goto error;
164 }
165
166 /*
167 * now set up the mc_kset under the edac class object 94 * now set up the mc_kset under the edac class object
168 */ 95 */
169 err = edac_sysfs_setup_mc_kset(); 96 err = edac_sysfs_setup_mc_kset();
170 if (err) 97 if (err)
171 goto sysfs_setup_fail; 98 goto error;
172 99
173 /* Setup/Initialize the workq for this core */ 100 /* Setup/Initialize the workq for this core */
174 err = edac_workqueue_setup(); 101 err = edac_workqueue_setup();
@@ -183,9 +110,6 @@ static int __init edac_init(void)
183workq_fail: 110workq_fail:
184 edac_sysfs_teardown_mc_kset(); 111 edac_sysfs_teardown_mc_kset();
185 112
186sysfs_setup_fail:
187 edac_unregister_sysfs_edac_name();
188
189error: 113error:
190 return err; 114 return err;
191} 115}
@@ -201,7 +125,6 @@ static void __exit edac_exit(void)
201 /* tear down the various subsystems */ 125 /* tear down the various subsystems */
202 edac_workqueue_teardown(); 126 edac_workqueue_teardown();
203 edac_sysfs_teardown_mc_kset(); 127 edac_sysfs_teardown_mc_kset();
204 edac_unregister_sysfs_edac_name();
205} 128}
206 129
207/* 130/*
diff --git a/drivers/edac/edac_module.h b/drivers/edac/edac_module.h
index 233d4798c3aa..17aabb7b90ec 100644
--- a/drivers/edac/edac_module.h
+++ b/drivers/edac/edac_module.h
@@ -42,7 +42,6 @@ extern void edac_device_unregister_sysfs_main_kobj(
42 struct edac_device_ctl_info *edac_dev); 42 struct edac_device_ctl_info *edac_dev);
43extern int edac_device_create_sysfs(struct edac_device_ctl_info *edac_dev); 43extern int edac_device_create_sysfs(struct edac_device_ctl_info *edac_dev);
44extern void edac_device_remove_sysfs(struct edac_device_ctl_info *edac_dev); 44extern void edac_device_remove_sysfs(struct edac_device_ctl_info *edac_dev);
45extern struct sysdev_class *edac_get_edac_class(void);
46 45
47/* edac core workqueue: single CPU mode */ 46/* edac core workqueue: single CPU mode */
48extern struct workqueue_struct *edac_workqueue; 47extern struct workqueue_struct *edac_workqueue;
diff --git a/drivers/edac/edac_pci_sysfs.c b/drivers/edac/edac_pci_sysfs.c
index c39697df9cb4..023b01cb5175 100644
--- a/drivers/edac/edac_pci_sysfs.c
+++ b/drivers/edac/edac_pci_sysfs.c
@@ -7,7 +7,7 @@
7 * 7 *
8 */ 8 */
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/sysdev.h> 10#include <linux/edac.h>
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <linux/ctype.h> 12#include <linux/ctype.h>
13 13
@@ -354,7 +354,7 @@ static int edac_pci_main_kobj_setup(void)
354 /* First time, so create the main kobject and its 354 /* First time, so create the main kobject and its
 355 * controls and attributes 355 * controls and attributes
356 */ 356 */
357 edac_class = edac_get_edac_class(); 357 edac_class = edac_get_sysfs_class();
358 if (edac_class == NULL) { 358 if (edac_class == NULL) {
359 debugf1("%s() no edac_class\n", __func__); 359 debugf1("%s() no edac_class\n", __func__);
360 err = -ENODEV; 360 err = -ENODEV;
@@ -368,7 +368,7 @@ static int edac_pci_main_kobj_setup(void)
368 if (!try_module_get(THIS_MODULE)) { 368 if (!try_module_get(THIS_MODULE)) {
369 debugf1("%s() try_module_get() failed\n", __func__); 369 debugf1("%s() try_module_get() failed\n", __func__);
370 err = -ENODEV; 370 err = -ENODEV;
371 goto decrement_count_fail; 371 goto mod_get_fail;
372 } 372 }
373 373
374 edac_pci_top_main_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL); 374 edac_pci_top_main_kobj = kzalloc(sizeof(struct kobject), GFP_KERNEL);
@@ -403,6 +403,9 @@ kobject_init_and_add_fail:
403kzalloc_fail: 403kzalloc_fail:
404 module_put(THIS_MODULE); 404 module_put(THIS_MODULE);
405 405
406mod_get_fail:
407 edac_put_sysfs_class();
408
406decrement_count_fail: 409decrement_count_fail:
 407 /* if we are on this error exit, nothing to tear down */ 410 /* if we are on this error exit, nothing to tear down */
408 atomic_dec(&edac_pci_sysfs_refcount); 411 atomic_dec(&edac_pci_sysfs_refcount);
@@ -429,6 +432,7 @@ static void edac_pci_main_kobj_teardown(void)
429 __func__); 432 __func__);
430 kobject_put(edac_pci_top_main_kobj); 433 kobject_put(edac_pci_top_main_kobj);
431 } 434 }
435 edac_put_sysfs_class();
432} 436}
433 437
434/* 438/*
diff --git a/drivers/edac/edac_stub.c b/drivers/edac/edac_stub.c
index 20b428aa155e..aab970760b75 100644
--- a/drivers/edac/edac_stub.c
+++ b/drivers/edac/edac_stub.c
@@ -3,10 +3,13 @@
3 * 3 *
4 * Author: Dave Jiang <djiang@mvista.com> 4 * Author: Dave Jiang <djiang@mvista.com>
5 * 5 *
6 * 2007 (c) MontaVista Software, Inc. This file is licensed under 6 * 2007 (c) MontaVista Software, Inc.
7 * the terms of the GNU General Public License version 2. This program 7 * 2010 (c) Advanced Micro Devices Inc.
8 * is licensed "as is" without any warranty of any kind, whether express 8 * Borislav Petkov <borislav.petkov@amd.com>
9 * or implied. 9 *
10 * This file is licensed under the terms of the GNU General Public
11 * License version 2. This program is licensed "as is" without any
12 * warranty of any kind, whether express or implied.
10 * 13 *
11 */ 14 */
12#include <linux/module.h> 15#include <linux/module.h>
@@ -23,6 +26,8 @@ EXPORT_SYMBOL_GPL(edac_handlers);
23int edac_err_assert = 0; 26int edac_err_assert = 0;
24EXPORT_SYMBOL_GPL(edac_err_assert); 27EXPORT_SYMBOL_GPL(edac_err_assert);
25 28
29static atomic_t edac_class_valid = ATOMIC_INIT(0);
30
26/* 31/*
27 * called to determine if there is an EDAC driver interested in 32 * called to determine if there is an EDAC driver interested in
28 * knowing an event (such as NMI) occurred 33 * knowing an event (such as NMI) occurred
@@ -44,3 +49,41 @@ void edac_atomic_assert_error(void)
44 edac_err_assert++; 49 edac_err_assert++;
45} 50}
46EXPORT_SYMBOL_GPL(edac_atomic_assert_error); 51EXPORT_SYMBOL_GPL(edac_atomic_assert_error);
52
53/*
54 * sysfs object: /sys/devices/system/edac
55 * need to export to other files
56 */
57struct sysdev_class edac_class = {
58 .name = "edac",
59};
60EXPORT_SYMBOL_GPL(edac_class);
61
62/* return pointer to the 'edac' node in sysfs */
63struct sysdev_class *edac_get_sysfs_class(void)
64{
65 int err = 0;
66
67 if (atomic_read(&edac_class_valid))
68 goto out;
69
70 /* create the /sys/devices/system/edac directory */
71 err = sysdev_class_register(&edac_class);
72 if (err) {
73 printk(KERN_ERR "Error registering toplevel EDAC sysfs dir\n");
74 return NULL;
75 }
76
77out:
78 atomic_inc(&edac_class_valid);
79 return &edac_class;
80}
81EXPORT_SYMBOL_GPL(edac_get_sysfs_class);
82
83void edac_put_sysfs_class(void)
84{
85 /* last user unregisters it */
86 if (atomic_dec_and_test(&edac_class_valid))
87 sysdev_class_unregister(&edac_class);
88}
89EXPORT_SYMBOL_GPL(edac_put_sysfs_class);
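
The two helpers added above move the top-level 'edac' sysdev class out of edac_module.c and make it reference counted: the first caller of edac_get_sysfs_class() registers the class, later callers only bump edac_class_valid, and edac_put_sysfs_class() unregisters it when the last user drops its reference. A minimal usage sketch follows, assuming the prototypes exposed through <linux/edac.h>; the names example_setup, example_teardown and example_class are illustrative and not part of the patch.

#include <linux/edac.h>		/* assumed to declare edac_get/put_sysfs_class() */

static struct sysdev_class *example_class;	/* illustrative caller state */

static int example_setup(void)
{
	example_class = edac_get_sysfs_class();	/* first get registers the class */
	if (!example_class)
		return -ENODEV;

	/* ... create ksets/kobjects under example_class ... */
	return 0;
}

static void example_teardown(void)
{
	/* ... remove ksets/kobjects first ... */
	edac_put_sysfs_class();			/* last put unregisters the class */
}
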
diff --git a/drivers/edac/mce_amd.c b/drivers/edac/mce_amd.c
new file mode 100644
index 000000000000..c0181093b490
--- /dev/null
+++ b/drivers/edac/mce_amd.c
@@ -0,0 +1,680 @@
1#include <linux/module.h>
2#include <linux/slab.h>
3
4#include "mce_amd.h"
5
6static struct amd_decoder_ops *fam_ops;
7
8static u8 nb_err_cpumask = 0xf;
9
10static bool report_gart_errors;
11static void (*nb_bus_decoder)(int node_id, struct mce *m, u32 nbcfg);
12
13void amd_report_gart_errors(bool v)
14{
15 report_gart_errors = v;
16}
17EXPORT_SYMBOL_GPL(amd_report_gart_errors);
18
19void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32))
20{
21 nb_bus_decoder = f;
22}
23EXPORT_SYMBOL_GPL(amd_register_ecc_decoder);
24
25void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32))
26{
27 if (nb_bus_decoder) {
28 WARN_ON(nb_bus_decoder != f);
29
30 nb_bus_decoder = NULL;
31 }
32}
33EXPORT_SYMBOL_GPL(amd_unregister_ecc_decoder);
34
35/*
36 * string representation for the different MCA reported error types, see F3x48
37 * or MSR0000_0411.
38 */
39
40/* transaction type */
41const char *tt_msgs[] = { "INSN", "DATA", "GEN", "RESV" };
42EXPORT_SYMBOL_GPL(tt_msgs);
43
44/* cache level */
45const char *ll_msgs[] = { "RESV", "L1", "L2", "L3/GEN" };
46EXPORT_SYMBOL_GPL(ll_msgs);
47
48/* memory transaction type */
49const char *rrrr_msgs[] = {
50 "GEN", "RD", "WR", "DRD", "DWR", "IRD", "PRF", "EV", "SNP"
51};
52EXPORT_SYMBOL_GPL(rrrr_msgs);
53
54/* participating processor */
55const char *pp_msgs[] = { "SRC", "RES", "OBS", "GEN" };
56EXPORT_SYMBOL_GPL(pp_msgs);
57
58/* request timeout */
59const char *to_msgs[] = { "no timeout", "timed out" };
60EXPORT_SYMBOL_GPL(to_msgs);
61
62/* memory or i/o */
63const char *ii_msgs[] = { "MEM", "RESV", "IO", "GEN" };
64EXPORT_SYMBOL_GPL(ii_msgs);
65
66static const char *f10h_nb_mce_desc[] = {
67 "HT link data error",
68 "Protocol error (link, L3, probe filter, etc.)",
69 "Parity error in NB-internal arrays",
70 "Link Retry due to IO link transmission error",
71 "L3 ECC data cache error",
72 "ECC error in L3 cache tag",
73 "L3 LRU parity bits error",
74 "ECC Error in the Probe Filter directory"
75};
76
77static bool f12h_dc_mce(u16 ec)
78{
79 bool ret = false;
80
81 if (MEM_ERROR(ec)) {
82 u8 ll = ec & 0x3;
83 ret = true;
84
85 if (ll == LL_L2)
86 pr_cont("during L1 linefill from L2.\n");
87 else if (ll == LL_L1)
88 pr_cont("Data/Tag %s error.\n", RRRR_MSG(ec));
89 else
90 ret = false;
91 }
92 return ret;
93}
94
95static bool f10h_dc_mce(u16 ec)
96{
97 u8 r4 = (ec >> 4) & 0xf;
98 u8 ll = ec & 0x3;
99
100 if (r4 == R4_GEN && ll == LL_L1) {
101 pr_cont("during data scrub.\n");
102 return true;
103 }
104 return f12h_dc_mce(ec);
105}
106
107static bool k8_dc_mce(u16 ec)
108{
109 if (BUS_ERROR(ec)) {
110 pr_cont("during system linefill.\n");
111 return true;
112 }
113
114 return f10h_dc_mce(ec);
115}
116
117static bool f14h_dc_mce(u16 ec)
118{
119 u8 r4 = (ec >> 4) & 0xf;
120 u8 ll = ec & 0x3;
121 u8 tt = (ec >> 2) & 0x3;
122 u8 ii = tt;
123 bool ret = true;
124
125 if (MEM_ERROR(ec)) {
126
127 if (tt != TT_DATA || ll != LL_L1)
128 return false;
129
130 switch (r4) {
131 case R4_DRD:
132 case R4_DWR:
133 pr_cont("Data/Tag parity error due to %s.\n",
134 (r4 == R4_DRD ? "load/hw prf" : "store"));
135 break;
136 case R4_EVICT:
137 pr_cont("Copyback parity error on a tag miss.\n");
138 break;
139 case R4_SNOOP:
140 pr_cont("Tag parity error during snoop.\n");
141 break;
142 default:
143 ret = false;
144 }
145 } else if (BUS_ERROR(ec)) {
146
147 if ((ii != II_MEM && ii != II_IO) || ll != LL_LG)
148 return false;
149
150 pr_cont("System read data error on a ");
151
152 switch (r4) {
153 case R4_RD:
154 pr_cont("TLB reload.\n");
155 break;
156 case R4_DWR:
157 pr_cont("store.\n");
158 break;
159 case R4_DRD:
160 pr_cont("load.\n");
161 break;
162 default:
163 ret = false;
164 }
165 } else {
166 ret = false;
167 }
168
169 return ret;
170}
171
172static void amd_decode_dc_mce(struct mce *m)
173{
174 u16 ec = m->status & 0xffff;
175 u8 xec = (m->status >> 16) & 0xf;
176
177 pr_emerg(HW_ERR "Data Cache Error: ");
178
179 /* TLB error signatures are the same across families */
180 if (TLB_ERROR(ec)) {
181 u8 tt = (ec >> 2) & 0x3;
182
183 if (tt == TT_DATA) {
184 pr_cont("%s TLB %s.\n", LL_MSG(ec),
185 (xec ? "multimatch" : "parity error"));
186 return;
187 }
188 else
189 goto wrong_dc_mce;
190 }
191
192 if (!fam_ops->dc_mce(ec))
193 goto wrong_dc_mce;
194
195 return;
196
197wrong_dc_mce:
198 pr_emerg(HW_ERR "Corrupted DC MCE info?\n");
199}
200
201static bool k8_ic_mce(u16 ec)
202{
203 u8 ll = ec & 0x3;
204 u8 r4 = (ec >> 4) & 0xf;
205 bool ret = true;
206
207 if (!MEM_ERROR(ec))
208 return false;
209
210 if (ll == 0x2)
211 pr_cont("during a linefill from L2.\n");
212 else if (ll == 0x1) {
213 switch (r4) {
214 case R4_IRD:
215 pr_cont("Parity error during data load.\n");
216 break;
217
218 case R4_EVICT:
219 pr_cont("Copyback Parity/Victim error.\n");
220 break;
221
222 case R4_SNOOP:
223 pr_cont("Tag Snoop error.\n");
224 break;
225
226 default:
227 ret = false;
228 break;
229 }
230 } else
231 ret = false;
232
233 return ret;
234}
235
236static bool f14h_ic_mce(u16 ec)
237{
238 u8 ll = ec & 0x3;
239 u8 tt = (ec >> 2) & 0x3;
240 u8 r4 = (ec >> 4) & 0xf;
241 bool ret = true;
242
243 if (MEM_ERROR(ec)) {
244 if (tt != 0 || ll != 1)
245 ret = false;
246
247 if (r4 == R4_IRD)
248 pr_cont("Data/tag array parity error for a tag hit.\n");
249 else if (r4 == R4_SNOOP)
250 pr_cont("Tag error during snoop/victimization.\n");
251 else
252 ret = false;
253 }
254 return ret;
255}
256
257static void amd_decode_ic_mce(struct mce *m)
258{
259 u16 ec = m->status & 0xffff;
260 u8 xec = (m->status >> 16) & 0xf;
261
262 pr_emerg(HW_ERR "Instruction Cache Error: ");
263
264 if (TLB_ERROR(ec))
265 pr_cont("%s TLB %s.\n", LL_MSG(ec),
266 (xec ? "multimatch" : "parity error"));
267 else if (BUS_ERROR(ec)) {
268 bool k8 = (boot_cpu_data.x86 == 0xf && (m->status & BIT_64(58)));
269
270 pr_cont("during %s.\n", (k8 ? "system linefill" : "NB data read"));
271 } else if (fam_ops->ic_mce(ec))
272 ;
273 else
274 pr_emerg(HW_ERR "Corrupted IC MCE info?\n");
275}
276
277static void amd_decode_bu_mce(struct mce *m)
278{
279 u32 ec = m->status & 0xffff;
280 u32 xec = (m->status >> 16) & 0xf;
281
282 pr_emerg(HW_ERR "Bus Unit Error");
283
284 if (xec == 0x1)
285 pr_cont(" in the write data buffers.\n");
286 else if (xec == 0x3)
287 pr_cont(" in the victim data buffers.\n");
288 else if (xec == 0x2 && MEM_ERROR(ec))
289 pr_cont(": %s error in the L2 cache tags.\n", RRRR_MSG(ec));
290 else if (xec == 0x0) {
291 if (TLB_ERROR(ec))
292 pr_cont(": %s error in a Page Descriptor Cache or "
293 "Guest TLB.\n", TT_MSG(ec));
294 else if (BUS_ERROR(ec))
295 pr_cont(": %s/ECC error in data read from NB: %s.\n",
296 RRRR_MSG(ec), PP_MSG(ec));
297 else if (MEM_ERROR(ec)) {
298 u8 rrrr = (ec >> 4) & 0xf;
299
300 if (rrrr >= 0x7)
301 pr_cont(": %s error during data copyback.\n",
302 RRRR_MSG(ec));
303 else if (rrrr <= 0x1)
304 pr_cont(": %s parity/ECC error during data "
305 "access from L2.\n", RRRR_MSG(ec));
306 else
307 goto wrong_bu_mce;
308 } else
309 goto wrong_bu_mce;
310 } else
311 goto wrong_bu_mce;
312
313 return;
314
315wrong_bu_mce:
316 pr_emerg(HW_ERR "Corrupted BU MCE info?\n");
317}
318
319static void amd_decode_ls_mce(struct mce *m)
320{
321 u16 ec = m->status & 0xffff;
322 u8 xec = (m->status >> 16) & 0xf;
323
324 if (boot_cpu_data.x86 == 0x14) {
325 pr_emerg("You shouldn't be seeing an LS MCE on this cpu family,"
326 " please report on LKML.\n");
327 return;
328 }
329
330 pr_emerg(HW_ERR "Load Store Error");
331
332 if (xec == 0x0) {
333 u8 r4 = (ec >> 4) & 0xf;
334
335 if (!BUS_ERROR(ec) || (r4 != R4_DRD && r4 != R4_DWR))
336 goto wrong_ls_mce;
337
338 pr_cont(" during %s.\n", RRRR_MSG(ec));
339 } else
340 goto wrong_ls_mce;
341
342 return;
343
344wrong_ls_mce:
345 pr_emerg(HW_ERR "Corrupted LS MCE info?\n");
346}
347
348static bool k8_nb_mce(u16 ec, u8 xec)
349{
350 bool ret = true;
351
352 switch (xec) {
353 case 0x1:
354 pr_cont("CRC error detected on HT link.\n");
355 break;
356
357 case 0x5:
358 pr_cont("Invalid GART PTE entry during GART table walk.\n");
359 break;
360
361 case 0x6:
362 pr_cont("Unsupported atomic RMW received from an IO link.\n");
363 break;
364
365 case 0x0:
366 case 0x8:
367 if (boot_cpu_data.x86 == 0x11)
368 return false;
369
370 pr_cont("DRAM ECC error detected on the NB.\n");
371 break;
372
373 case 0xd:
374 pr_cont("Parity error on the DRAM addr/ctl signals.\n");
375 break;
376
377 default:
378 ret = false;
379 break;
380 }
381
382 return ret;
383}
384
385static bool f10h_nb_mce(u16 ec, u8 xec)
386{
387 bool ret = true;
388 u8 offset = 0;
389
390 if (k8_nb_mce(ec, xec))
391 return true;
392
393 switch(xec) {
394 case 0xa ... 0xc:
395 offset = 10;
396 break;
397
398 case 0xe:
399 offset = 11;
400 break;
401
402 case 0xf:
403 if (TLB_ERROR(ec))
404 pr_cont("GART Table Walk data error.\n");
405 else if (BUS_ERROR(ec))
406 pr_cont("DMA Exclusion Vector Table Walk error.\n");
407 else
408 ret = false;
409
410 goto out;
411 break;
412
413 case 0x1c ... 0x1f:
414 offset = 24;
415 break;
416
417 default:
418 ret = false;
419
420 goto out;
421 break;
422 }
423
424 pr_cont("%s.\n", f10h_nb_mce_desc[xec - offset]);
425
426out:
427 return ret;
428}
429
430static bool nb_noop_mce(u16 ec, u8 xec)
431{
432 return false;
433}
434
435void amd_decode_nb_mce(int node_id, struct mce *m, u32 nbcfg)
436{
437 u8 xec = (m->status >> 16) & 0x1f;
438 u16 ec = m->status & 0xffff;
439 u32 nbsh = (u32)(m->status >> 32);
440
441 pr_emerg(HW_ERR "Northbridge Error, node %d: ", node_id);
442
443 /*
444 * F10h, revD can disable ErrCpu[3:0] so check that first and also the
445 * value encoding has changed so interpret those differently
446 */
447 if ((boot_cpu_data.x86 == 0x10) &&
448 (boot_cpu_data.x86_model > 7)) {
449 if (nbsh & K8_NBSH_ERR_CPU_VAL)
450 pr_cont(", core: %u", (u8)(nbsh & nb_err_cpumask));
451 } else {
452 u8 assoc_cpus = nbsh & nb_err_cpumask;
453
454 if (assoc_cpus > 0)
455 pr_cont(", core: %d", fls(assoc_cpus) - 1);
456 }
457
458 switch (xec) {
459 case 0x2:
460 pr_cont("Sync error (sync packets on HT link detected).\n");
461 return;
462
463 case 0x3:
464 pr_cont("HT Master abort.\n");
465 return;
466
467 case 0x4:
468 pr_cont("HT Target abort.\n");
469 return;
470
471 case 0x7:
472 pr_cont("NB Watchdog timeout.\n");
473 return;
474
475 case 0x9:
476 pr_cont("SVM DMA Exclusion Vector error.\n");
477 return;
478
479 default:
480 break;
481 }
482
483 if (!fam_ops->nb_mce(ec, xec))
484 goto wrong_nb_mce;
485
486 if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10)
487 if ((xec == 0x8 || xec == 0x0) && nb_bus_decoder)
488 nb_bus_decoder(node_id, m, nbcfg);
489
490 return;
491
492wrong_nb_mce:
493 pr_emerg(HW_ERR "Corrupted NB MCE info?\n");
494}
495EXPORT_SYMBOL_GPL(amd_decode_nb_mce);
496
497static void amd_decode_fr_mce(struct mce *m)
498{
499 if (boot_cpu_data.x86 == 0xf ||
500 boot_cpu_data.x86 == 0x11)
501 goto wrong_fr_mce;
502
503 /* we have only one error signature so match all fields at once. */
504 if ((m->status & 0xffff) == 0x0f0f) {
505 pr_emerg(HW_ERR "FR Error: CPU Watchdog timer expire.\n");
506 return;
507 }
508
509wrong_fr_mce:
510 pr_emerg(HW_ERR "Corrupted FR MCE info?\n");
511}
512
513static inline void amd_decode_err_code(u16 ec)
514{
515 if (TLB_ERROR(ec)) {
516 pr_emerg(HW_ERR "Transaction: %s, Cache Level: %s\n",
517 TT_MSG(ec), LL_MSG(ec));
518 } else if (MEM_ERROR(ec)) {
519 pr_emerg(HW_ERR "Transaction: %s, Type: %s, Cache Level: %s\n",
520 RRRR_MSG(ec), TT_MSG(ec), LL_MSG(ec));
521 } else if (BUS_ERROR(ec)) {
522 pr_emerg(HW_ERR "Transaction: %s (%s), %s, Cache Level: %s, "
523 "Participating Processor: %s\n",
524 RRRR_MSG(ec), II_MSG(ec), TO_MSG(ec), LL_MSG(ec),
525 PP_MSG(ec));
526 } else
527 pr_emerg(HW_ERR "Huh? Unknown MCE error 0x%x\n", ec);
528}
529
530/*
531 * Filter out unwanted MCE signatures here.
532 */
533static bool amd_filter_mce(struct mce *m)
534{
535 u8 xec = (m->status >> 16) & 0x1f;
536
537 /*
538 * NB GART TLB error reporting is disabled by default.
539 */
540 if (m->bank == 4 && xec == 0x5 && !report_gart_errors)
541 return true;
542
543 return false;
544}
545
546int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data)
547{
548 struct mce *m = (struct mce *)data;
549 int node, ecc;
550
551 if (amd_filter_mce(m))
552 return NOTIFY_STOP;
553
554 pr_emerg(HW_ERR "MC%d_STATUS: ", m->bank);
555
556 pr_cont("%sorrected error, other errors lost: %s, "
557 "CPU context corrupt: %s",
558 ((m->status & MCI_STATUS_UC) ? "Unc" : "C"),
559 ((m->status & MCI_STATUS_OVER) ? "yes" : "no"),
560 ((m->status & MCI_STATUS_PCC) ? "yes" : "no"));
561
562 /* do the two bits[14:13] together */
563 ecc = (m->status >> 45) & 0x3;
564 if (ecc)
565 pr_cont(", %sECC Error", ((ecc == 2) ? "C" : "U"));
566
567 pr_cont("\n");
568
569 switch (m->bank) {
570 case 0:
571 amd_decode_dc_mce(m);
572 break;
573
574 case 1:
575 amd_decode_ic_mce(m);
576 break;
577
578 case 2:
579 amd_decode_bu_mce(m);
580 break;
581
582 case 3:
583 amd_decode_ls_mce(m);
584 break;
585
586 case 4:
587 node = amd_get_nb_id(m->extcpu);
588 amd_decode_nb_mce(node, m, 0);
589 break;
590
591 case 5:
592 amd_decode_fr_mce(m);
593 break;
594
595 default:
596 break;
597 }
598
599 amd_decode_err_code(m->status & 0xffff);
600
601 return NOTIFY_STOP;
602}
603EXPORT_SYMBOL_GPL(amd_decode_mce);
604
605static struct notifier_block amd_mce_dec_nb = {
606 .notifier_call = amd_decode_mce,
607};
608
609static int __init mce_amd_init(void)
610{
611 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
612 return 0;
613
614 if ((boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x12) &&
615 (boot_cpu_data.x86 != 0x14 || boot_cpu_data.x86_model > 0xf))
616 return 0;
617
618 fam_ops = kzalloc(sizeof(struct amd_decoder_ops), GFP_KERNEL);
619 if (!fam_ops)
620 return -ENOMEM;
621
622 switch (boot_cpu_data.x86) {
623 case 0xf:
624 fam_ops->dc_mce = k8_dc_mce;
625 fam_ops->ic_mce = k8_ic_mce;
626 fam_ops->nb_mce = k8_nb_mce;
627 break;
628
629 case 0x10:
630 fam_ops->dc_mce = f10h_dc_mce;
631 fam_ops->ic_mce = k8_ic_mce;
632 fam_ops->nb_mce = f10h_nb_mce;
633 break;
634
635 case 0x11:
636 fam_ops->dc_mce = k8_dc_mce;
637 fam_ops->ic_mce = k8_ic_mce;
638 fam_ops->nb_mce = f10h_nb_mce;
639 break;
640
641 case 0x12:
642 fam_ops->dc_mce = f12h_dc_mce;
643 fam_ops->ic_mce = k8_ic_mce;
644 fam_ops->nb_mce = nb_noop_mce;
645 break;
646
647 case 0x14:
648 nb_err_cpumask = 0x3;
649 fam_ops->dc_mce = f14h_dc_mce;
650 fam_ops->ic_mce = f14h_ic_mce;
651 fam_ops->nb_mce = nb_noop_mce;
652 break;
653
654 default:
655 printk(KERN_WARNING "Huh? What family is that: %d?!\n",
656 boot_cpu_data.x86);
657 kfree(fam_ops);
658 return -EINVAL;
659 }
660
661 pr_info("MCE: In-kernel MCE decoding enabled.\n");
662
663 atomic_notifier_chain_register(&x86_mce_decoder_chain, &amd_mce_dec_nb);
664
665 return 0;
666}
667early_initcall(mce_amd_init);
668
669#ifdef MODULE
670static void __exit mce_amd_exit(void)
671{
672 atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &amd_mce_dec_nb);
673 kfree(fam_ops);
674}
675
676MODULE_DESCRIPTION("AMD MCE decoder");
677MODULE_ALIAS("edac-mce-amd");
678MODULE_LICENSE("GPL");
679module_exit(mce_amd_exit);
680#endif
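
The file above registers amd_decode_mce() on the x86_mce_decoder_chain notifier and exports amd_register_ecc_decoder()/amd_unregister_ecc_decoder() so an EDAC memory-controller driver can attach its own DRAM ECC decoder for northbridge (bank 4) errors. A sketch of such a consumer follows, assuming only the interfaces declared in mce_amd.h; the module and callback names (my_nb_decoder, my_decoder_init/exit) are illustrative, not taken from the patch.

#include <linux/module.h>
#include "mce_amd.h"

/* Hypothetical per-node NB ECC decoder; a real driver would map the
 * reported address/syndrome to a DIMM or csrow here. */
static void my_nb_decoder(int node_id, struct mce *m, u32 nbcfg)
{
	pr_info("NB MCE on node %d, status 0x%016llx\n",
		node_id, (unsigned long long)m->status);
}

static int __init my_decoder_init(void)
{
	amd_report_gart_errors(false);		/* keep GART TLB noise filtered */
	amd_register_ecc_decoder(my_nb_decoder);
	return 0;
}

static void __exit my_decoder_exit(void)
{
	amd_unregister_ecc_decoder(my_nb_decoder);
}

module_init(my_decoder_init);
module_exit(my_decoder_exit);
MODULE_LICENSE("GPL");
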
diff --git a/drivers/edac/edac_mce_amd.h b/drivers/edac/mce_amd.h
index df23ee065f79..35f6e0e3b297 100644
--- a/drivers/edac/edac_mce_amd.h
+++ b/drivers/edac/mce_amd.h
@@ -1,11 +1,14 @@
1#ifndef _EDAC_MCE_AMD_H 1#ifndef _EDAC_MCE_AMD_H
2#define _EDAC_MCE_AMD_H 2#define _EDAC_MCE_AMD_H
3 3
4#include <linux/notifier.h>
5
4#include <asm/mce.h> 6#include <asm/mce.h>
5 7
8#define BIT_64(n) (U64_C(1) << (n))
9
6#define ERROR_CODE(x) ((x) & 0xffff) 10#define ERROR_CODE(x) ((x) & 0xffff)
7#define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f) 11#define EXT_ERROR_CODE(x) (((x) >> 16) & 0x1f)
8#define EXT_ERR_MSG(x) ext_msgs[EXT_ERROR_CODE(x)]
9 12
10#define LOW_SYNDROME(x) (((x) >> 15) & 0xff) 13#define LOW_SYNDROME(x) (((x) >> 15) & 0xff)
11#define HIGH_SYNDROME(x) (((x) >> 24) & 0xff) 14#define HIGH_SYNDROME(x) (((x) >> 24) & 0xff)
@@ -20,13 +23,14 @@
20#define II_MSG(x) ii_msgs[II(x)] 23#define II_MSG(x) ii_msgs[II(x)]
21#define LL(x) (((x) >> 0) & 0x3) 24#define LL(x) (((x) >> 0) & 0x3)
22#define LL_MSG(x) ll_msgs[LL(x)] 25#define LL_MSG(x) ll_msgs[LL(x)]
23#define RRRR(x) (((x) >> 4) & 0xf)
24#define RRRR_MSG(x) rrrr_msgs[RRRR(x)]
25#define TO(x) (((x) >> 8) & 0x1) 26#define TO(x) (((x) >> 8) & 0x1)
26#define TO_MSG(x) to_msgs[TO(x)] 27#define TO_MSG(x) to_msgs[TO(x)]
27#define PP(x) (((x) >> 9) & 0x3) 28#define PP(x) (((x) >> 9) & 0x3)
28#define PP_MSG(x) pp_msgs[PP(x)] 29#define PP_MSG(x) pp_msgs[PP(x)]
29 30
31#define RRRR(x) (((x) >> 4) & 0xf)
32#define RRRR_MSG(x) ((RRRR(x) < 9) ? rrrr_msgs[RRRR(x)] : "Wrong R4!")
33
30#define K8_NBSH 0x4C 34#define K8_NBSH 0x4C
31 35
32#define K8_NBSH_VALID_BIT BIT(31) 36#define K8_NBSH_VALID_BIT BIT(31)
@@ -41,13 +45,45 @@
41#define K8_NBSH_UECC BIT(13) 45#define K8_NBSH_UECC BIT(13)
42#define K8_NBSH_ERR_SCRUBER BIT(8) 46#define K8_NBSH_ERR_SCRUBER BIT(8)
43 47
48enum tt_ids {
49 TT_INSTR = 0,
50 TT_DATA,
51 TT_GEN,
52 TT_RESV,
53};
54
55enum ll_ids {
56 LL_RESV = 0,
57 LL_L1,
58 LL_L2,
59 LL_LG,
60};
61
62enum ii_ids {
63 II_MEM = 0,
64 II_RESV,
65 II_IO,
66 II_GEN,
67};
68
69enum rrrr_ids {
70 R4_GEN = 0,
71 R4_RD,
72 R4_WR,
73 R4_DRD,
74 R4_DWR,
75 R4_IRD,
76 R4_PREF,
77 R4_EVICT,
78 R4_SNOOP,
79};
80
44extern const char *tt_msgs[]; 81extern const char *tt_msgs[];
45extern const char *ll_msgs[]; 82extern const char *ll_msgs[];
46extern const char *rrrr_msgs[]; 83extern const char *rrrr_msgs[];
47extern const char *pp_msgs[]; 84extern const char *pp_msgs[];
48extern const char *to_msgs[]; 85extern const char *to_msgs[];
49extern const char *ii_msgs[]; 86extern const char *ii_msgs[];
50extern const char *ext_msgs[];
51 87
52/* 88/*
53 * relevant NB regs 89 * relevant NB regs
@@ -60,10 +96,19 @@ struct err_regs {
60 u32 nbeal; 96 u32 nbeal;
61}; 97};
62 98
99/*
100 * per-family decoder ops
101 */
102struct amd_decoder_ops {
103 bool (*dc_mce)(u16);
104 bool (*ic_mce)(u16);
105 bool (*nb_mce)(u16, u8);
106};
63 107
64void amd_report_gart_errors(bool); 108void amd_report_gart_errors(bool);
65void amd_register_ecc_decoder(void (*f)(int, struct err_regs *)); 109void amd_register_ecc_decoder(void (*f)(int, struct mce *, u32));
66void amd_unregister_ecc_decoder(void (*f)(int, struct err_regs *)); 110void amd_unregister_ecc_decoder(void (*f)(int, struct mce *, u32));
67void amd_decode_nb_mce(int, struct err_regs *, int); 111void amd_decode_nb_mce(int, struct mce *, u32);
112int amd_decode_mce(struct notifier_block *nb, unsigned long val, void *data);
68 113
69#endif /* _EDAC_MCE_AMD_H */ 114#endif /* _EDAC_MCE_AMD_H */
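
The decode macros above index the exported message tables with small bit fields of the MCi_STATUS error code, and the new RRRR_MSG() guard matters because rrrr_msgs[] now carries only the nine defined R4 strings. A short worked example, assuming mce_amd.h is included; example_decode and the sample value are illustrative only.

static void example_decode(void)
{
	u16 ec = 0x0115;	/* sample error code, chosen for illustration */

	/* LL(ec) = 1 -> "L1", TT(ec) = 1 -> "DATA", RRRR(ec) = 1 -> "RD" */
	pr_info("tt=%s ll=%s r4=%s\n", TT_MSG(ec), LL_MSG(ec), RRRR_MSG(ec));

	/* any RRRR value of 9 or above now prints "Wrong R4!" instead of
	 * indexing past the end of rrrr_msgs[] */
}
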
diff --git a/drivers/edac/mce_amd_inj.c b/drivers/edac/mce_amd_inj.c
new file mode 100644
index 000000000000..8d0688f36d4c
--- /dev/null
+++ b/drivers/edac/mce_amd_inj.c
@@ -0,0 +1,171 @@
1/*
2 * A simple MCE injection facility for testing the MCE decoding code. This
3 * driver should be built as module so that it can be loaded on production
4 * kernels for testing purposes.
5 *
6 * This file may be distributed under the terms of the GNU General Public
7 * License version 2.
8 *
9 * Copyright (c) 2010: Borislav Petkov <borislav.petkov@amd.com>
10 * Advanced Micro Devices Inc.
11 */
12
13#include <linux/kobject.h>
14#include <linux/sysdev.h>
15#include <linux/edac.h>
16#include <asm/mce.h>
17
18#include "mce_amd.h"
19
20struct edac_mce_attr {
21 struct attribute attr;
22 ssize_t (*show) (struct kobject *kobj, struct edac_mce_attr *attr, char *buf);
23 ssize_t (*store)(struct kobject *kobj, struct edac_mce_attr *attr,
24 const char *buf, size_t count);
25};
26
27#define EDAC_MCE_ATTR(_name, _mode, _show, _store) \
28static struct edac_mce_attr mce_attr_##_name = __ATTR(_name, _mode, _show, _store)
29
30static struct kobject *mce_kobj;
31
32/*
33 * Collect all the MCi_XXX settings
34 */
35static struct mce i_mce;
36
37#define MCE_INJECT_STORE(reg) \
38static ssize_t edac_inject_##reg##_store(struct kobject *kobj, \
39 struct edac_mce_attr *attr, \
40 const char *data, size_t count)\
41{ \
42 int ret = 0; \
43 unsigned long value; \
44 \
45 ret = strict_strtoul(data, 16, &value); \
46 if (ret < 0) \
47 printk(KERN_ERR "Error writing MCE " #reg " field.\n"); \
48 \
49 i_mce.reg = value; \
50 \
51 return count; \
52}
53
54MCE_INJECT_STORE(status);
55MCE_INJECT_STORE(misc);
56MCE_INJECT_STORE(addr);
57
58#define MCE_INJECT_SHOW(reg) \
59static ssize_t edac_inject_##reg##_show(struct kobject *kobj, \
60 struct edac_mce_attr *attr, \
61 char *buf) \
62{ \
63 return sprintf(buf, "0x%016llx\n", i_mce.reg); \
64}
65
66MCE_INJECT_SHOW(status);
67MCE_INJECT_SHOW(misc);
68MCE_INJECT_SHOW(addr);
69
70EDAC_MCE_ATTR(status, 0644, edac_inject_status_show, edac_inject_status_store);
71EDAC_MCE_ATTR(misc, 0644, edac_inject_misc_show, edac_inject_misc_store);
72EDAC_MCE_ATTR(addr, 0644, edac_inject_addr_show, edac_inject_addr_store);
73
74/*
75 * This denotes into which bank we're injecting and triggers
76 * the injection, at the same time.
77 */
78static ssize_t edac_inject_bank_store(struct kobject *kobj,
79 struct edac_mce_attr *attr,
80 const char *data, size_t count)
81{
82 int ret = 0;
83 unsigned long value;
84
85 ret = strict_strtoul(data, 10, &value);
86 if (ret < 0) {
87 printk(KERN_ERR "Invalid bank value!\n");
88 return -EINVAL;
89 }
90
91 if (value > 5) {
 92 printk(KERN_ERR "Non-existent MCE bank: %lu\n", value);
93 return -EINVAL;
94 }
95
96 i_mce.bank = value;
97
98 amd_decode_mce(NULL, 0, &i_mce);
99
100 return count;
101}
102
103static ssize_t edac_inject_bank_show(struct kobject *kobj,
104 struct edac_mce_attr *attr, char *buf)
105{
106 return sprintf(buf, "%d\n", i_mce.bank);
107}
108
109EDAC_MCE_ATTR(bank, 0644, edac_inject_bank_show, edac_inject_bank_store);
110
111static struct edac_mce_attr *sysfs_attrs[] = { &mce_attr_status, &mce_attr_misc,
112 &mce_attr_addr, &mce_attr_bank
113};
114
115static int __init edac_init_mce_inject(void)
116{
117 struct sysdev_class *edac_class = NULL;
118 int i, err = 0;
119
120 edac_class = edac_get_sysfs_class();
121 if (!edac_class)
122 return -EINVAL;
123
124 mce_kobj = kobject_create_and_add("mce", &edac_class->kset.kobj);
125 if (!mce_kobj) {
126 printk(KERN_ERR "Error creating a mce kset.\n");
127 err = -ENOMEM;
128 goto err_mce_kobj;
129 }
130
131 for (i = 0; i < ARRAY_SIZE(sysfs_attrs); i++) {
132 err = sysfs_create_file(mce_kobj, &sysfs_attrs[i]->attr);
133 if (err) {
134 printk(KERN_ERR "Error creating %s in sysfs.\n",
135 sysfs_attrs[i]->attr.name);
136 goto err_sysfs_create;
137 }
138 }
139 return 0;
140
141err_sysfs_create:
 142 while (--i >= 0)
143 sysfs_remove_file(mce_kobj, &sysfs_attrs[i]->attr);
144
145 kobject_del(mce_kobj);
146
147err_mce_kobj:
148 edac_put_sysfs_class();
149
150 return err;
151}
152
153static void __exit edac_exit_mce_inject(void)
154{
155 int i;
156
157 for (i = 0; i < ARRAY_SIZE(sysfs_attrs); i++)
158 sysfs_remove_file(mce_kobj, &sysfs_attrs[i]->attr);
159
160 kobject_del(mce_kobj);
161
162 edac_put_sysfs_class();
163}
164
165module_init(edac_init_mce_inject);
166module_exit(edac_exit_mce_inject);
167
168MODULE_LICENSE("GPL");
169MODULE_AUTHOR("Borislav Petkov <borislav.petkov@amd.com>");
170MODULE_AUTHOR("AMD Inc.");
171MODULE_DESCRIPTION("MCE injection facility for testing MCE decoding");
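
The MCE_INJECT_STORE()/MCE_INJECT_SHOW() macros in this module generate one sysfs store/show pair per struct mce field; writing the bank attribute then feeds the assembled record straight into amd_decode_mce(). For reference, MCE_INJECT_STORE(status) expands to roughly the following (whitespace adjusted, derived from the macro body above).

static ssize_t edac_inject_status_store(struct kobject *kobj,
					struct edac_mce_attr *attr,
					const char *data, size_t count)
{
	int ret = 0;
	unsigned long value;

	ret = strict_strtoul(data, 16, &value);	/* fields are written in hex */
	if (ret < 0)
		printk(KERN_ERR "Error writing MCE status field.\n");

	i_mce.status = value;

	return count;
}
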
diff --git a/drivers/firmware/Kconfig b/drivers/firmware/Kconfig
index 280c9b5ad9e3..88a3ae6cd023 100644
--- a/drivers/firmware/Kconfig
+++ b/drivers/firmware/Kconfig
@@ -125,7 +125,7 @@ config ISCSI_IBFT_FIND
125config ISCSI_IBFT 125config ISCSI_IBFT
126 tristate "iSCSI Boot Firmware Table Attributes module" 126 tristate "iSCSI Boot Firmware Table Attributes module"
127 select ISCSI_BOOT_SYSFS 127 select ISCSI_BOOT_SYSFS
128 depends on ISCSI_IBFT_FIND && SCSI 128 depends on ISCSI_IBFT_FIND && SCSI && SCSI_LOWLEVEL
129 default n 129 default n
130 help 130 help
131 This option enables support for detection and exposing of iSCSI 131 This option enables support for detection and exposing of iSCSI
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index c37ef64d1465..cb3ccf3ed221 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -59,18 +59,11 @@
59#include <linux/hrtimer.h> /* ktime_get_real() */ 59#include <linux/hrtimer.h> /* ktime_get_real() */
60#include <trace/events/power.h> 60#include <trace/events/power.h>
61#include <linux/sched.h> 61#include <linux/sched.h>
62#include <asm/mwait.h>
62 63
63#define INTEL_IDLE_VERSION "0.4" 64#define INTEL_IDLE_VERSION "0.4"
64#define PREFIX "intel_idle: " 65#define PREFIX "intel_idle: "
65 66
66#define MWAIT_SUBSTATE_MASK (0xf)
67#define MWAIT_CSTATE_MASK (0xf)
68#define MWAIT_SUBSTATE_SIZE (4)
69#define MWAIT_MAX_NUM_CSTATES 8
70#define CPUID_MWAIT_LEAF (5)
71#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
72#define CPUID5_ECX_INTERRUPT_BREAK (0x2)
73
74static struct cpuidle_driver intel_idle_driver = { 67static struct cpuidle_driver intel_idle_driver = {
75 .name = "intel_idle", 68 .name = "intel_idle",
76 .owner = THIS_MODULE, 69 .owner = THIS_MODULE,
diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c
index 9ddafc30f432..af9ee313c10b 100644
--- a/drivers/input/evdev.c
+++ b/drivers/input/evdev.c
@@ -28,7 +28,7 @@ struct evdev {
28 int minor; 28 int minor;
29 struct input_handle handle; 29 struct input_handle handle;
30 wait_queue_head_t wait; 30 wait_queue_head_t wait;
31 struct evdev_client *grab; 31 struct evdev_client __rcu *grab;
32 struct list_head client_list; 32 struct list_head client_list;
33 spinlock_t client_lock; /* protects client_list */ 33 spinlock_t client_lock; /* protects client_list */
34 struct mutex mutex; 34 struct mutex mutex;
diff --git a/drivers/input/misc/hp_sdc_rtc.c b/drivers/input/misc/hp_sdc_rtc.c
index c19066479057..7e2c12a5b839 100644
--- a/drivers/input/misc/hp_sdc_rtc.c
+++ b/drivers/input/misc/hp_sdc_rtc.c
@@ -104,7 +104,7 @@ static int hp_sdc_rtc_do_read_bbrtc (struct rtc_time *rtctm)
104 t.endidx = 91; 104 t.endidx = 91;
105 t.seq = tseq; 105 t.seq = tseq;
106 t.act.semaphore = &tsem; 106 t.act.semaphore = &tsem;
107 init_MUTEX_LOCKED(&tsem); 107 sema_init(&tsem, 0);
108 108
109 if (hp_sdc_enqueue_transaction(&t)) return -1; 109 if (hp_sdc_enqueue_transaction(&t)) return -1;
110 110
@@ -698,7 +698,7 @@ static int __init hp_sdc_rtc_init(void)
698 return -ENODEV; 698 return -ENODEV;
699#endif 699#endif
700 700
701 init_MUTEX(&i8042tregs); 701 sema_init(&i8042tregs, 1);
702 702
703 if ((ret = hp_sdc_request_timer_irq(&hp_sdc_rtc_isr))) 703 if ((ret = hp_sdc_request_timer_irq(&hp_sdc_rtc_isr)))
704 return ret; 704 return ret;
diff --git a/drivers/input/serio/hil_mlc.c b/drivers/input/serio/hil_mlc.c
index c92f4edfee7b..e5624d8f1709 100644
--- a/drivers/input/serio/hil_mlc.c
+++ b/drivers/input/serio/hil_mlc.c
@@ -915,15 +915,15 @@ int hil_mlc_register(hil_mlc *mlc)
915 mlc->ostarted = 0; 915 mlc->ostarted = 0;
916 916
917 rwlock_init(&mlc->lock); 917 rwlock_init(&mlc->lock);
918 init_MUTEX(&mlc->osem); 918 sema_init(&mlc->osem, 1);
919 919
920 init_MUTEX(&mlc->isem); 920 sema_init(&mlc->isem, 1);
921 mlc->icount = -1; 921 mlc->icount = -1;
922 mlc->imatch = 0; 922 mlc->imatch = 0;
923 923
924 mlc->opercnt = 0; 924 mlc->opercnt = 0;
925 925
926 init_MUTEX_LOCKED(&(mlc->csem)); 926 sema_init(&(mlc->csem), 0);
927 927
928 hil_mlc_clear_di_scratch(mlc); 928 hil_mlc_clear_di_scratch(mlc);
929 hil_mlc_clear_di_map(mlc, 0); 929 hil_mlc_clear_di_map(mlc, 0);
diff --git a/drivers/input/serio/hp_sdc.c b/drivers/input/serio/hp_sdc.c
index bcc2d30ec245..8c0b51c31424 100644
--- a/drivers/input/serio/hp_sdc.c
+++ b/drivers/input/serio/hp_sdc.c
@@ -905,7 +905,7 @@ static int __init hp_sdc_init(void)
905 ts_sync[1] = 0x0f; 905 ts_sync[1] = 0x0f;
906 ts_sync[2] = ts_sync[3] = ts_sync[4] = ts_sync[5] = 0; 906 ts_sync[2] = ts_sync[3] = ts_sync[4] = ts_sync[5] = 0;
907 t_sync.act.semaphore = &s_sync; 907 t_sync.act.semaphore = &s_sync;
908 init_MUTEX_LOCKED(&s_sync); 908 sema_init(&s_sync, 0);
909 hp_sdc_enqueue_transaction(&t_sync); 909 hp_sdc_enqueue_transaction(&t_sync);
910 down(&s_sync); /* Wait for t_sync to complete */ 910 down(&s_sync); /* Wait for t_sync to complete */
911 911
@@ -1039,7 +1039,7 @@ static int __init hp_sdc_register(void)
1039 return hp_sdc.dev_err; 1039 return hp_sdc.dev_err;
1040 } 1040 }
1041 1041
1042 init_MUTEX_LOCKED(&tq_init_sem); 1042 sema_init(&tq_init_sem, 0);
1043 1043
1044 tq_init.actidx = 0; 1044 tq_init.actidx = 0;
1045 tq_init.idx = 1; 1045 tq_init.idx = 1;
diff --git a/drivers/isdn/act2000/act2000.h b/drivers/isdn/act2000/act2000.h
index d4c50512a1ff..88c9423500d8 100644
--- a/drivers/isdn/act2000/act2000.h
+++ b/drivers/isdn/act2000/act2000.h
@@ -141,9 +141,9 @@ typedef struct irq_data_isa {
141 __u8 rcvhdr[8]; 141 __u8 rcvhdr[8];
142} irq_data_isa; 142} irq_data_isa;
143 143
144typedef union irq_data { 144typedef union act2000_irq_data {
145 irq_data_isa isa; 145 irq_data_isa isa;
146} irq_data; 146} act2000_irq_data;
147 147
148/* 148/*
149 * Per card driver data 149 * Per card driver data
@@ -176,7 +176,7 @@ typedef struct act2000_card {
176 char *status_buf_read; 176 char *status_buf_read;
177 char *status_buf_write; 177 char *status_buf_write;
178 char *status_buf_end; 178 char *status_buf_end;
179 irq_data idat; /* Data used for IRQ handler */ 179 act2000_irq_data idat; /* Data used for IRQ handler */
180 isdn_if interface; /* Interface to upper layer */ 180 isdn_if interface; /* Interface to upper layer */
181 char regname[35]; /* Name used for request_region */ 181 char regname[35]; /* Name used for request_region */
182} act2000_card; 182} act2000_card;
diff --git a/drivers/isdn/hisax/config.c b/drivers/isdn/hisax/config.c
index 6f9afcd5ca4e..b133378d4dc9 100644
--- a/drivers/isdn/hisax/config.c
+++ b/drivers/isdn/hisax/config.c
@@ -801,6 +801,16 @@ static void closecard(int cardnr)
801 ll_unload(csta); 801 ll_unload(csta);
802} 802}
803 803
804static irqreturn_t card_irq(int intno, void *dev_id)
805{
806 struct IsdnCardState *cs = dev_id;
807 irqreturn_t ret = cs->irq_func(intno, cs);
808
809 if (ret == IRQ_HANDLED)
810 cs->irq_cnt++;
811 return ret;
812}
813
804static int init_card(struct IsdnCardState *cs) 814static int init_card(struct IsdnCardState *cs)
805{ 815{
806 int irq_cnt, cnt = 3, ret; 816 int irq_cnt, cnt = 3, ret;
@@ -809,10 +819,10 @@ static int init_card(struct IsdnCardState *cs)
809 ret = cs->cardmsg(cs, CARD_INIT, NULL); 819 ret = cs->cardmsg(cs, CARD_INIT, NULL);
810 return(ret); 820 return(ret);
811 } 821 }
812 irq_cnt = kstat_irqs(cs->irq); 822 irq_cnt = cs->irq_cnt = 0;
813 printk(KERN_INFO "%s: IRQ %d count %d\n", CardType[cs->typ], 823 printk(KERN_INFO "%s: IRQ %d count %d\n", CardType[cs->typ],
814 cs->irq, irq_cnt); 824 cs->irq, irq_cnt);
815 if (request_irq(cs->irq, cs->irq_func, cs->irq_flags, "HiSax", cs)) { 825 if (request_irq(cs->irq, card_irq, cs->irq_flags, "HiSax", cs)) {
816 printk(KERN_WARNING "HiSax: couldn't get interrupt %d\n", 826 printk(KERN_WARNING "HiSax: couldn't get interrupt %d\n",
817 cs->irq); 827 cs->irq);
818 return 1; 828 return 1;
@@ -822,8 +832,8 @@ static int init_card(struct IsdnCardState *cs)
822 /* Timeout 10ms */ 832 /* Timeout 10ms */
823 msleep(10); 833 msleep(10);
824 printk(KERN_INFO "%s: IRQ %d count %d\n", 834 printk(KERN_INFO "%s: IRQ %d count %d\n",
825 CardType[cs->typ], cs->irq, kstat_irqs(cs->irq)); 835 CardType[cs->typ], cs->irq, cs->irq_cnt);
826 if (kstat_irqs(cs->irq) == irq_cnt) { 836 if (cs->irq_cnt == irq_cnt) {
827 printk(KERN_WARNING 837 printk(KERN_WARNING
828 "%s: IRQ(%d) getting no interrupts during init %d\n", 838 "%s: IRQ(%d) getting no interrupts during init %d\n",
829 CardType[cs->typ], cs->irq, 4 - cnt); 839 CardType[cs->typ], cs->irq, 4 - cnt);
diff --git a/drivers/isdn/hisax/hisax.h b/drivers/isdn/hisax/hisax.h
index 832a87855ffb..32ab3924aa73 100644
--- a/drivers/isdn/hisax/hisax.h
+++ b/drivers/isdn/hisax/hisax.h
@@ -959,6 +959,7 @@ struct IsdnCardState {
959 u_long event; 959 u_long event;
960 struct work_struct tqueue; 960 struct work_struct tqueue;
961 struct timer_list dbusytimer; 961 struct timer_list dbusytimer;
962 unsigned int irq_cnt;
962#ifdef ERROR_STATISTIC 963#ifdef ERROR_STATISTIC
963 int err_crc; 964 int err_crc;
964 int err_tx; 965 int err_tx;
diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c
index 1c4ee6e77937..bf64e49d996a 100644
--- a/drivers/macintosh/adb.c
+++ b/drivers/macintosh/adb.c
@@ -83,7 +83,7 @@ static struct adb_driver *adb_controller;
83BLOCKING_NOTIFIER_HEAD(adb_client_list); 83BLOCKING_NOTIFIER_HEAD(adb_client_list);
84static int adb_got_sleep; 84static int adb_got_sleep;
85static int adb_inited; 85static int adb_inited;
86static DECLARE_MUTEX(adb_probe_mutex); 86static DEFINE_SEMAPHORE(adb_probe_mutex);
87static int sleepy_trackpad; 87static int sleepy_trackpad;
88static int autopoll_devs; 88static int autopoll_devs;
89int __adb_probe_sync; 89int __adb_probe_sync;
diff --git a/drivers/mfd/twl4030-irq.c b/drivers/mfd/twl4030-irq.c
index 097f24d8bceb..b9fda7018cef 100644
--- a/drivers/mfd/twl4030-irq.c
+++ b/drivers/mfd/twl4030-irq.c
@@ -78,7 +78,7 @@ struct sih {
78 u8 irq_lines; /* number of supported irq lines */ 78 u8 irq_lines; /* number of supported irq lines */
79 79
80 /* SIR ignored -- set interrupt, for testing only */ 80 /* SIR ignored -- set interrupt, for testing only */
81 struct irq_data { 81 struct sih_irq_data {
82 u8 isr_offset; 82 u8 isr_offset;
83 u8 imr_offset; 83 u8 imr_offset;
84 } mask[2]; 84 } mask[2];
@@ -810,7 +810,7 @@ int twl4030_init_irq(int irq_num, unsigned irq_base, unsigned irq_end)
810 twl4030_irq_chip = dummy_irq_chip; 810 twl4030_irq_chip = dummy_irq_chip;
811 twl4030_irq_chip.name = "twl4030"; 811 twl4030_irq_chip.name = "twl4030";
812 812
813 twl4030_sih_irq_chip.ack = dummy_irq_chip.ack; 813 twl4030_sih_irq_chip.irq_ack = dummy_irq_chip.irq_ack;
814 814
815 for (i = irq_base; i < irq_end; i++) { 815 for (i = irq_base; i < irq_end; i++) {
816 set_irq_chip_and_handler(i, &twl4030_irq_chip, 816 set_irq_chip_and_handler(i, &twl4030_irq_chip,
diff --git a/drivers/net/3c527.c b/drivers/net/3c527.c
index 70705d1306b9..eca55c52bdfd 100644
--- a/drivers/net/3c527.c
+++ b/drivers/net/3c527.c
@@ -522,7 +522,7 @@ static int __init mc32_probe1(struct net_device *dev, int slot)
522 lp->tx_len = lp->exec_box->data[9]; /* Transmit list count */ 522 lp->tx_len = lp->exec_box->data[9]; /* Transmit list count */
523 lp->rx_len = lp->exec_box->data[11]; /* Receive list count */ 523 lp->rx_len = lp->exec_box->data[11]; /* Receive list count */
524 524
525 init_MUTEX_LOCKED(&lp->cmd_mutex); 525 sema_init(&lp->cmd_mutex, 0);
526 init_completion(&lp->execution_cmd); 526 init_completion(&lp->execution_cmd);
527 init_completion(&lp->xceiver_cmd); 527 init_completion(&lp->xceiver_cmd);
528 528
diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c
index 4b52c767ad05..3e5d0b6b6516 100644
--- a/drivers/net/hamradio/6pack.c
+++ b/drivers/net/hamradio/6pack.c
@@ -608,7 +608,7 @@ static int sixpack_open(struct tty_struct *tty)
608 608
609 spin_lock_init(&sp->lock); 609 spin_lock_init(&sp->lock);
610 atomic_set(&sp->refcnt, 1); 610 atomic_set(&sp->refcnt, 1);
611 init_MUTEX_LOCKED(&sp->dead_sem); 611 sema_init(&sp->dead_sem, 0);
612 612
613 /* !!! length of the buffers. MTU is IP MTU, not PACLEN! */ 613 /* !!! length of the buffers. MTU is IP MTU, not PACLEN! */
614 614
diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c
index 66e88bd59caa..4c628393c8b1 100644
--- a/drivers/net/hamradio/mkiss.c
+++ b/drivers/net/hamradio/mkiss.c
@@ -747,7 +747,7 @@ static int mkiss_open(struct tty_struct *tty)
747 747
748 spin_lock_init(&ax->buflock); 748 spin_lock_init(&ax->buflock);
749 atomic_set(&ax->refcnt, 1); 749 atomic_set(&ax->refcnt, 1);
750 init_MUTEX_LOCKED(&ax->dead_sem); 750 sema_init(&ax->dead_sem, 0);
751 751
752 ax->tty = tty; 752 ax->tty = tty;
753 tty->disc_data = ax; 753 tty->disc_data = ax;
diff --git a/drivers/net/irda/sir_dev.c b/drivers/net/irda/sir_dev.c
index 1b051dab7b29..51d74447f8f8 100644
--- a/drivers/net/irda/sir_dev.c
+++ b/drivers/net/irda/sir_dev.c
@@ -909,7 +909,7 @@ struct sir_dev * sirdev_get_instance(const struct sir_driver *drv, const char *n
909 dev->tx_skb = NULL; 909 dev->tx_skb = NULL;
910 910
911 spin_lock_init(&dev->tx_lock); 911 spin_lock_init(&dev->tx_lock);
912 init_MUTEX(&dev->fsm.sem); 912 sema_init(&dev->fsm.sem, 1);
913 913
914 dev->drv = drv; 914 dev->drv = drv;
915 dev->netdev = ndev; 915 dev->netdev = ndev;
diff --git a/drivers/net/ppp_async.c b/drivers/net/ppp_async.c
index af50a530daee..78d70a6481bf 100644
--- a/drivers/net/ppp_async.c
+++ b/drivers/net/ppp_async.c
@@ -184,7 +184,7 @@ ppp_asynctty_open(struct tty_struct *tty)
184 tasklet_init(&ap->tsk, ppp_async_process, (unsigned long) ap); 184 tasklet_init(&ap->tsk, ppp_async_process, (unsigned long) ap);
185 185
186 atomic_set(&ap->refcnt, 1); 186 atomic_set(&ap->refcnt, 1);
187 init_MUTEX_LOCKED(&ap->dead_sem); 187 sema_init(&ap->dead_sem, 0);
188 188
189 ap->chan.private = ap; 189 ap->chan.private = ap;
190 ap->chan.ops = &async_ops; 190 ap->chan.ops = &async_ops;
diff --git a/drivers/net/wan/cosa.c b/drivers/net/wan/cosa.c
index 04c6cd4333f1..10bafd59f9c3 100644
--- a/drivers/net/wan/cosa.c
+++ b/drivers/net/wan/cosa.c
@@ -575,7 +575,7 @@ static int cosa_probe(int base, int irq, int dma)
575 575
576 /* Initialize the chardev data structures */ 576 /* Initialize the chardev data structures */
577 mutex_init(&chan->rlock); 577 mutex_init(&chan->rlock);
578 init_MUTEX(&chan->wsem); 578 sema_init(&chan->wsem, 1);
579 579
580 /* Register the network interface */ 580 /* Register the network interface */
581 if (!(chan->netdev = alloc_hdlcdev(chan))) { 581 if (!(chan->netdev = alloc_hdlcdev(chan))) {
diff --git a/drivers/parport/share.c b/drivers/parport/share.c
index dffa5d4fb298..a2d9d1e59260 100644
--- a/drivers/parport/share.c
+++ b/drivers/parport/share.c
@@ -306,7 +306,7 @@ struct parport *parport_register_port(unsigned long base, int irq, int dma,
306 spin_lock_init(&tmp->pardevice_lock); 306 spin_lock_init(&tmp->pardevice_lock);
307 tmp->ieee1284.mode = IEEE1284_MODE_COMPAT; 307 tmp->ieee1284.mode = IEEE1284_MODE_COMPAT;
308 tmp->ieee1284.phase = IEEE1284_PH_FWD_IDLE; 308 tmp->ieee1284.phase = IEEE1284_PH_FWD_IDLE;
309 init_MUTEX_LOCKED (&tmp->ieee1284.irq); /* actually a semaphore at 0 */ 309 sema_init(&tmp->ieee1284.irq, 0);
310 tmp->spintime = parport_default_spintime; 310 tmp->spintime = parport_default_spintime;
311 atomic_set (&tmp->ref_count, 1); 311 atomic_set (&tmp->ref_count, 1);
312 INIT_LIST_HEAD(&tmp->full_list); 312 INIT_LIST_HEAD(&tmp->full_list);
diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c
index 0a19708074c2..3de3a436a432 100644
--- a/drivers/pci/dmar.c
+++ b/drivers/pci/dmar.c
@@ -1221,9 +1221,9 @@ const char *dmar_get_fault_reason(u8 fault_reason, int *fault_type)
1221 } 1221 }
1222} 1222}
1223 1223
1224void dmar_msi_unmask(unsigned int irq) 1224void dmar_msi_unmask(struct irq_data *data)
1225{ 1225{
1226 struct intel_iommu *iommu = get_irq_data(irq); 1226 struct intel_iommu *iommu = irq_data_get_irq_data(data);
1227 unsigned long flag; 1227 unsigned long flag;
1228 1228
1229 /* unmask it */ 1229 /* unmask it */
@@ -1234,10 +1234,10 @@ void dmar_msi_unmask(unsigned int irq)
1234 spin_unlock_irqrestore(&iommu->register_lock, flag); 1234 spin_unlock_irqrestore(&iommu->register_lock, flag);
1235} 1235}
1236 1236
1237void dmar_msi_mask(unsigned int irq) 1237void dmar_msi_mask(struct irq_data *data)
1238{ 1238{
1239 unsigned long flag; 1239 unsigned long flag;
1240 struct intel_iommu *iommu = get_irq_data(irq); 1240 struct intel_iommu *iommu = irq_data_get_irq_data(data);
1241 1241
1242 /* mask it */ 1242 /* mask it */
1243 spin_lock_irqsave(&iommu->register_lock, flag); 1243 spin_lock_irqsave(&iommu->register_lock, flag);
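
dmar_msi_unmask()/dmar_msi_mask() (and mask_ht_irq()/unmask_ht_irq() in the next file) follow the new irq_chip calling convention: callbacks take a struct irq_data * instead of a bare irq number, and the per-interrupt cookie is read from that structure rather than through get_irq_data(irq). A rough sketch of the new callback shape; the chip, handler and cookie names are invented for illustration:

    #include <linux/irq.h>
    #include <linux/io.h>

    struct my_cookie {                      /* hypothetical per-irq state,  */
            void __iomem *regs;             /* installed via set_irq_data() */
    };

    static void my_irq_mask(struct irq_data *data)
    {
            struct my_cookie *c = irq_data_get_irq_data(data);

            writel(0, c->regs);             /* data->irq still gives the irq number */
    }

    static struct irq_chip my_chip = {
            .name     = "example",
            .irq_mask = my_irq_mask,        /* new-style member taking irq_data */
    };
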
diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c
index 98abf8b91294..834842aa5bbf 100644
--- a/drivers/pci/htirq.c
+++ b/drivers/pci/htirq.c
@@ -57,28 +57,22 @@ void fetch_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg)
57 *msg = cfg->msg; 57 *msg = cfg->msg;
58} 58}
59 59
60void mask_ht_irq(unsigned int irq) 60void mask_ht_irq(struct irq_data *data)
61{ 61{
62 struct ht_irq_cfg *cfg; 62 struct ht_irq_cfg *cfg = irq_data_get_irq_data(data);
63 struct ht_irq_msg msg; 63 struct ht_irq_msg msg = cfg->msg;
64
65 cfg = get_irq_data(irq);
66 64
67 msg = cfg->msg;
68 msg.address_lo |= 1; 65 msg.address_lo |= 1;
69 write_ht_irq_msg(irq, &msg); 66 write_ht_irq_msg(data->irq, &msg);
70} 67}
71 68
72void unmask_ht_irq(unsigned int irq) 69void unmask_ht_irq(struct irq_data *data)
73{ 70{
74 struct ht_irq_cfg *cfg; 71 struct ht_irq_cfg *cfg = irq_data_get_irq_data(data);
75 struct ht_irq_msg msg; 72 struct ht_irq_msg msg = cfg->msg;
76
77 cfg = get_irq_data(irq);
78 73
79 msg = cfg->msg;
80 msg.address_lo &= ~1; 74 msg.address_lo &= ~1;
81 write_ht_irq_msg(irq, &msg); 75 write_ht_irq_msg(data->irq, &msg);
82} 76}
83 77
84/** 78/**
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
index fd1d2867cdcc..ec87cd66f3eb 100644
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -46,109 +46,24 @@ static __init int setup_intremap(char *str)
46} 46}
47early_param("intremap", setup_intremap); 47early_param("intremap", setup_intremap);
48 48
49struct irq_2_iommu {
50 struct intel_iommu *iommu;
51 u16 irte_index;
52 u16 sub_handle;
53 u8 irte_mask;
54};
55
56#ifdef CONFIG_GENERIC_HARDIRQS
57static struct irq_2_iommu *get_one_free_irq_2_iommu(int node)
58{
59 struct irq_2_iommu *iommu;
60
61 iommu = kzalloc_node(sizeof(*iommu), GFP_ATOMIC, node);
62 printk(KERN_DEBUG "alloc irq_2_iommu on node %d\n", node);
63
64 return iommu;
65}
66
67static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
68{
69 struct irq_desc *desc;
70
71 desc = irq_to_desc(irq);
72
73 if (WARN_ON_ONCE(!desc))
74 return NULL;
75
76 return desc->irq_2_iommu;
77}
78
79static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
80{
81 struct irq_desc *desc;
82 struct irq_2_iommu *irq_iommu;
83
84 desc = irq_to_desc(irq);
85 if (!desc) {
86 printk(KERN_INFO "can not get irq_desc for %d\n", irq);
87 return NULL;
88 }
89
90 irq_iommu = desc->irq_2_iommu;
91
92 if (!irq_iommu)
93 desc->irq_2_iommu = get_one_free_irq_2_iommu(irq_node(irq));
94
95 return desc->irq_2_iommu;
96}
97
98#else /* !CONFIG_SPARSE_IRQ */
99
100static struct irq_2_iommu irq_2_iommuX[NR_IRQS];
101
102static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
103{
104 if (irq < nr_irqs)
105 return &irq_2_iommuX[irq];
106
107 return NULL;
108}
109static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
110{
111 return irq_2_iommu(irq);
112}
113#endif
114
115static DEFINE_SPINLOCK(irq_2_ir_lock); 49static DEFINE_SPINLOCK(irq_2_ir_lock);
116 50
117static struct irq_2_iommu *valid_irq_2_iommu(unsigned int irq) 51static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
118{
119 struct irq_2_iommu *irq_iommu;
120
121 irq_iommu = irq_2_iommu(irq);
122
123 if (!irq_iommu)
124 return NULL;
125
126 if (!irq_iommu->iommu)
127 return NULL;
128
129 return irq_iommu;
130}
131
132int irq_remapped(int irq)
133{ 52{
134 return valid_irq_2_iommu(irq) != NULL; 53 struct irq_cfg *cfg = get_irq_chip_data(irq);
54 return cfg ? &cfg->irq_2_iommu : NULL;
135} 55}
136 56
137int get_irte(int irq, struct irte *entry) 57int get_irte(int irq, struct irte *entry)
138{ 58{
139 int index; 59 struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
140 struct irq_2_iommu *irq_iommu;
141 unsigned long flags; 60 unsigned long flags;
61 int index;
142 62
143 if (!entry) 63 if (!entry || !irq_iommu)
144 return -1; 64 return -1;
145 65
146 spin_lock_irqsave(&irq_2_ir_lock, flags); 66 spin_lock_irqsave(&irq_2_ir_lock, flags);
147 irq_iommu = valid_irq_2_iommu(irq);
148 if (!irq_iommu) {
149 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
150 return -1;
151 }
152 67
153 index = irq_iommu->irte_index + irq_iommu->sub_handle; 68 index = irq_iommu->irte_index + irq_iommu->sub_handle;
154 *entry = *(irq_iommu->iommu->ir_table->base + index); 69 *entry = *(irq_iommu->iommu->ir_table->base + index);
@@ -160,20 +75,14 @@ int get_irte(int irq, struct irte *entry)
160int alloc_irte(struct intel_iommu *iommu, int irq, u16 count) 75int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
161{ 76{
162 struct ir_table *table = iommu->ir_table; 77 struct ir_table *table = iommu->ir_table;
163 struct irq_2_iommu *irq_iommu; 78 struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
164 u16 index, start_index; 79 u16 index, start_index;
165 unsigned int mask = 0; 80 unsigned int mask = 0;
166 unsigned long flags; 81 unsigned long flags;
167 int i; 82 int i;
168 83
169 if (!count) 84 if (!count || !irq_iommu)
170 return -1;
171
172#ifndef CONFIG_SPARSE_IRQ
173 /* protect irq_2_iommu_alloc later */
174 if (irq >= nr_irqs)
175 return -1; 85 return -1;
176#endif
177 86
178 /* 87 /*
179 * start the IRTE search from index 0. 88 * start the IRTE search from index 0.
@@ -214,13 +123,6 @@ int alloc_irte(struct intel_iommu *iommu, int irq, u16 count)
214 for (i = index; i < index + count; i++) 123 for (i = index; i < index + count; i++)
215 table->base[i].present = 1; 124 table->base[i].present = 1;
216 125
217 irq_iommu = irq_2_iommu_alloc(irq);
218 if (!irq_iommu) {
219 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
220 printk(KERN_ERR "can't allocate irq_2_iommu\n");
221 return -1;
222 }
223
224 irq_iommu->iommu = iommu; 126 irq_iommu->iommu = iommu;
225 irq_iommu->irte_index = index; 127 irq_iommu->irte_index = index;
226 irq_iommu->sub_handle = 0; 128 irq_iommu->sub_handle = 0;
@@ -244,17 +146,14 @@ static int qi_flush_iec(struct intel_iommu *iommu, int index, int mask)
244 146
245int map_irq_to_irte_handle(int irq, u16 *sub_handle) 147int map_irq_to_irte_handle(int irq, u16 *sub_handle)
246{ 148{
247 int index; 149 struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
248 struct irq_2_iommu *irq_iommu;
249 unsigned long flags; 150 unsigned long flags;
151 int index;
250 152
251 spin_lock_irqsave(&irq_2_ir_lock, flags); 153 if (!irq_iommu)
252 irq_iommu = valid_irq_2_iommu(irq);
253 if (!irq_iommu) {
254 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
255 return -1; 154 return -1;
256 }
257 155
156 spin_lock_irqsave(&irq_2_ir_lock, flags);
258 *sub_handle = irq_iommu->sub_handle; 157 *sub_handle = irq_iommu->sub_handle;
259 index = irq_iommu->irte_index; 158 index = irq_iommu->irte_index;
260 spin_unlock_irqrestore(&irq_2_ir_lock, flags); 159 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
@@ -263,18 +162,13 @@ int map_irq_to_irte_handle(int irq, u16 *sub_handle)
263 162
264int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle) 163int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
265{ 164{
266 struct irq_2_iommu *irq_iommu; 165 struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
267 unsigned long flags; 166 unsigned long flags;
268 167
269 spin_lock_irqsave(&irq_2_ir_lock, flags); 168 if (!irq_iommu)
270
271 irq_iommu = irq_2_iommu_alloc(irq);
272
273 if (!irq_iommu) {
274 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
275 printk(KERN_ERR "can't allocate irq_2_iommu\n");
276 return -1; 169 return -1;
277 } 170
171 spin_lock_irqsave(&irq_2_ir_lock, flags);
278 172
279 irq_iommu->iommu = iommu; 173 irq_iommu->iommu = iommu;
280 irq_iommu->irte_index = index; 174 irq_iommu->irte_index = index;
@@ -286,43 +180,18 @@ int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, u16 subhandle)
286 return 0; 180 return 0;
287} 181}
288 182
289int clear_irte_irq(int irq, struct intel_iommu *iommu, u16 index)
290{
291 struct irq_2_iommu *irq_iommu;
292 unsigned long flags;
293
294 spin_lock_irqsave(&irq_2_ir_lock, flags);
295 irq_iommu = valid_irq_2_iommu(irq);
296 if (!irq_iommu) {
297 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
298 return -1;
299 }
300
301 irq_iommu->iommu = NULL;
302 irq_iommu->irte_index = 0;
303 irq_iommu->sub_handle = 0;
304 irq_2_iommu(irq)->irte_mask = 0;
305
306 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
307
308 return 0;
309}
310
311int modify_irte(int irq, struct irte *irte_modified) 183int modify_irte(int irq, struct irte *irte_modified)
312{ 184{
313 int rc; 185 struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
314 int index;
315 struct irte *irte;
316 struct intel_iommu *iommu; 186 struct intel_iommu *iommu;
317 struct irq_2_iommu *irq_iommu;
318 unsigned long flags; 187 unsigned long flags;
188 struct irte *irte;
189 int rc, index;
319 190
320 spin_lock_irqsave(&irq_2_ir_lock, flags); 191 if (!irq_iommu)
321 irq_iommu = valid_irq_2_iommu(irq);
322 if (!irq_iommu) {
323 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
324 return -1; 192 return -1;
325 } 193
194 spin_lock_irqsave(&irq_2_ir_lock, flags);
326 195
327 iommu = irq_iommu->iommu; 196 iommu = irq_iommu->iommu;
328 197
@@ -339,31 +208,6 @@ int modify_irte(int irq, struct irte *irte_modified)
339 return rc; 208 return rc;
340} 209}
341 210
342int flush_irte(int irq)
343{
344 int rc;
345 int index;
346 struct intel_iommu *iommu;
347 struct irq_2_iommu *irq_iommu;
348 unsigned long flags;
349
350 spin_lock_irqsave(&irq_2_ir_lock, flags);
351 irq_iommu = valid_irq_2_iommu(irq);
352 if (!irq_iommu) {
353 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
354 return -1;
355 }
356
357 iommu = irq_iommu->iommu;
358
359 index = irq_iommu->irte_index + irq_iommu->sub_handle;
360
361 rc = qi_flush_iec(iommu, index, irq_iommu->irte_mask);
362 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
363
364 return rc;
365}
366
367struct intel_iommu *map_hpet_to_ir(u8 hpet_id) 211struct intel_iommu *map_hpet_to_ir(u8 hpet_id)
368{ 212{
369 int i; 213 int i;
@@ -420,16 +264,14 @@ static int clear_entries(struct irq_2_iommu *irq_iommu)
420 264
421int free_irte(int irq) 265int free_irte(int irq)
422{ 266{
423 int rc = 0; 267 struct irq_2_iommu *irq_iommu = irq_2_iommu(irq);
424 struct irq_2_iommu *irq_iommu;
425 unsigned long flags; 268 unsigned long flags;
269 int rc;
426 270
427 spin_lock_irqsave(&irq_2_ir_lock, flags); 271 if (!irq_iommu)
428 irq_iommu = valid_irq_2_iommu(irq);
429 if (!irq_iommu) {
430 spin_unlock_irqrestore(&irq_2_ir_lock, flags);
431 return -1; 272 return -1;
432 } 273
274 spin_lock_irqsave(&irq_2_ir_lock, flags);
433 275
434 rc = clear_entries(irq_iommu); 276 rc = clear_entries(irq_iommu);
435 277
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 69b7be33b3a2..5fcf5aec680f 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -170,33 +170,31 @@ static void msix_mask_irq(struct msi_desc *desc, u32 flag)
170 desc->masked = __msix_mask_irq(desc, flag); 170 desc->masked = __msix_mask_irq(desc, flag);
171} 171}
172 172
173static void msi_set_mask_bit(unsigned irq, u32 flag) 173static void msi_set_mask_bit(struct irq_data *data, u32 flag)
174{ 174{
175 struct msi_desc *desc = get_irq_msi(irq); 175 struct msi_desc *desc = irq_data_get_msi(data);
176 176
177 if (desc->msi_attrib.is_msix) { 177 if (desc->msi_attrib.is_msix) {
178 msix_mask_irq(desc, flag); 178 msix_mask_irq(desc, flag);
179 readl(desc->mask_base); /* Flush write to device */ 179 readl(desc->mask_base); /* Flush write to device */
180 } else { 180 } else {
181 unsigned offset = irq - desc->dev->irq; 181 unsigned offset = data->irq - desc->dev->irq;
182 msi_mask_irq(desc, 1 << offset, flag << offset); 182 msi_mask_irq(desc, 1 << offset, flag << offset);
183 } 183 }
184} 184}
185 185
186void mask_msi_irq(unsigned int irq) 186void mask_msi_irq(struct irq_data *data)
187{ 187{
188 msi_set_mask_bit(irq, 1); 188 msi_set_mask_bit(data, 1);
189} 189}
190 190
191void unmask_msi_irq(unsigned int irq) 191void unmask_msi_irq(struct irq_data *data)
192{ 192{
193 msi_set_mask_bit(irq, 0); 193 msi_set_mask_bit(data, 0);
194} 194}
195 195
196void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg) 196void __read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
197{ 197{
198 struct msi_desc *entry = get_irq_desc_msi(desc);
199
200 BUG_ON(entry->dev->current_state != PCI_D0); 198 BUG_ON(entry->dev->current_state != PCI_D0);
201 199
202 if (entry->msi_attrib.is_msix) { 200 if (entry->msi_attrib.is_msix) {
@@ -227,15 +225,13 @@ void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
227 225
228void read_msi_msg(unsigned int irq, struct msi_msg *msg) 226void read_msi_msg(unsigned int irq, struct msi_msg *msg)
229{ 227{
230 struct irq_desc *desc = irq_to_desc(irq); 228 struct msi_desc *entry = get_irq_msi(irq);
231 229
232 read_msi_msg_desc(desc, msg); 230 __read_msi_msg(entry, msg);
233} 231}
234 232
235void get_cached_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg) 233void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
236{ 234{
237 struct msi_desc *entry = get_irq_desc_msi(desc);
238
239 /* Assert that the cache is valid, assuming that 235 /* Assert that the cache is valid, assuming that
240 * valid messages are not all-zeroes. */ 236 * valid messages are not all-zeroes. */
241 BUG_ON(!(entry->msg.address_hi | entry->msg.address_lo | 237 BUG_ON(!(entry->msg.address_hi | entry->msg.address_lo |
@@ -246,15 +242,13 @@ void get_cached_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
246 242
247void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg) 243void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
248{ 244{
249 struct irq_desc *desc = irq_to_desc(irq); 245 struct msi_desc *entry = get_irq_msi(irq);
250 246
251 get_cached_msi_msg_desc(desc, msg); 247 __get_cached_msi_msg(entry, msg);
252} 248}
253 249
254void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg) 250void __write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
255{ 251{
256 struct msi_desc *entry = get_irq_desc_msi(desc);
257
258 if (entry->dev->current_state != PCI_D0) { 252 if (entry->dev->current_state != PCI_D0) {
259 /* Don't touch the hardware now */ 253 /* Don't touch the hardware now */
260 } else if (entry->msi_attrib.is_msix) { 254 } else if (entry->msi_attrib.is_msix) {
@@ -292,9 +286,9 @@ void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg)
292 286
293void write_msi_msg(unsigned int irq, struct msi_msg *msg) 287void write_msi_msg(unsigned int irq, struct msi_msg *msg)
294{ 288{
295 struct irq_desc *desc = irq_to_desc(irq); 289 struct msi_desc *entry = get_irq_msi(irq);
296 290
297 write_msi_msg_desc(desc, msg); 291 __write_msi_msg(entry, msg);
298} 292}
299 293
300static void free_msi_irqs(struct pci_dev *dev) 294static void free_msi_irqs(struct pci_dev *dev)
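
The irq_desc-based message helpers are renamed to __read_msi_msg(), __get_cached_msi_msg() and __write_msi_msg() and operate directly on a struct msi_desc; the irq-number wrappers just fetch the descriptor with get_irq_msi(). A hypothetical caller that previously went through irq_to_desc() would now read:

    #include <linux/irq.h>
    #include <linux/msi.h>

    static void resend_msi(unsigned int irq)        /* illustrative helper only */
    {
            struct msi_desc *desc = get_irq_msi(irq);
            struct msi_msg msg;

            __get_cached_msi_msg(desc, &msg);       /* was get_cached_msi_msg_desc() */
            __write_msi_msg(desc, &msg);            /* was write_msi_msg_desc()      */
    }
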
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 7c8008225ee3..17927b1f9334 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -127,7 +127,10 @@ static void handle_tx(struct vhost_net *net)
127 size_t len, total_len = 0; 127 size_t len, total_len = 0;
128 int err, wmem; 128 int err, wmem;
129 size_t hdr_size; 129 size_t hdr_size;
130 struct socket *sock = rcu_dereference(vq->private_data); 130 struct socket *sock;
131
132 sock = rcu_dereference_check(vq->private_data,
133 lockdep_is_held(&vq->mutex));
131 if (!sock) 134 if (!sock)
132 return; 135 return;
133 136
@@ -582,7 +585,10 @@ static void vhost_net_disable_vq(struct vhost_net *n,
582static void vhost_net_enable_vq(struct vhost_net *n, 585static void vhost_net_enable_vq(struct vhost_net *n,
583 struct vhost_virtqueue *vq) 586 struct vhost_virtqueue *vq)
584{ 587{
585 struct socket *sock = vq->private_data; 588 struct socket *sock;
589
590 sock = rcu_dereference_protected(vq->private_data,
591 lockdep_is_held(&vq->mutex));
586 if (!sock) 592 if (!sock)
587 return; 593 return;
588 if (vq == n->vqs + VHOST_NET_VQ_TX) { 594 if (vq == n->vqs + VHOST_NET_VQ_TX) {
@@ -598,7 +604,8 @@ static struct socket *vhost_net_stop_vq(struct vhost_net *n,
598 struct socket *sock; 604 struct socket *sock;
599 605
600 mutex_lock(&vq->mutex); 606 mutex_lock(&vq->mutex);
601 sock = vq->private_data; 607 sock = rcu_dereference_protected(vq->private_data,
608 lockdep_is_held(&vq->mutex));
602 vhost_net_disable_vq(n, vq); 609 vhost_net_disable_vq(n, vq);
603 rcu_assign_pointer(vq->private_data, NULL); 610 rcu_assign_pointer(vq->private_data, NULL);
604 mutex_unlock(&vq->mutex); 611 mutex_unlock(&vq->mutex);
@@ -736,7 +743,8 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
736 } 743 }
737 744
738 /* start polling new socket */ 745 /* start polling new socket */
739 oldsock = vq->private_data; 746 oldsock = rcu_dereference_protected(vq->private_data,
747 lockdep_is_held(&vq->mutex));
740 if (sock != oldsock) { 748 if (sock != oldsock) {
741 vhost_net_disable_vq(n, vq); 749 vhost_net_disable_vq(n, vq);
742 rcu_assign_pointer(vq->private_data, sock); 750 rcu_assign_pointer(vq->private_data, sock);
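
vq->private_data is RCU-managed but is also legitimately read by code that holds vq->mutex rather than sitting inside a read-side section, so the plain rcu_dereference() calls become rcu_dereference_check() (reader may rely on either) and rcu_dereference_protected() (update side holds the mutex; no read-side markers or barriers needed). A condensed sketch of the update-side idiom under that assumption; the type and function names are stand-ins, not the vhost ones:

    #include <linux/mutex.h>
    #include <linux/rcupdate.h>

    struct vq_like {                        /* stand-in for vhost_virtqueue */
            struct mutex mutex;
            void __rcu *private_data;
    };

    /* caller holds vq->mutex */
    static void *swap_backend(struct vq_like *vq, void *newp)
    {
            void *oldp = rcu_dereference_protected(vq->private_data,
                                                   lockdep_is_held(&vq->mutex));

            rcu_assign_pointer(vq->private_data, newp);
            synchronize_rcu();              /* readers are done with oldp after this */
            return oldp;
    }
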
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index dd3d6f7406f8..8b5a1b33d0fe 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -320,7 +320,7 @@ long vhost_dev_reset_owner(struct vhost_dev *dev)
320 vhost_dev_cleanup(dev); 320 vhost_dev_cleanup(dev);
321 321
322 memory->nregions = 0; 322 memory->nregions = 0;
323 dev->memory = memory; 323 RCU_INIT_POINTER(dev->memory, memory);
324 return 0; 324 return 0;
325} 325}
326 326
@@ -352,8 +352,9 @@ void vhost_dev_cleanup(struct vhost_dev *dev)
352 fput(dev->log_file); 352 fput(dev->log_file);
353 dev->log_file = NULL; 353 dev->log_file = NULL;
354 /* No one will access memory at this point */ 354 /* No one will access memory at this point */
355 kfree(dev->memory); 355 kfree(rcu_dereference_protected(dev->memory,
356 dev->memory = NULL; 356 lockdep_is_held(&dev->mutex)));
357 RCU_INIT_POINTER(dev->memory, NULL);
357 if (dev->mm) 358 if (dev->mm)
358 mmput(dev->mm); 359 mmput(dev->mm);
359 dev->mm = NULL; 360 dev->mm = NULL;
@@ -440,14 +441,22 @@ static int vq_access_ok(unsigned int num,
440/* Caller should have device mutex but not vq mutex */ 441/* Caller should have device mutex but not vq mutex */
441int vhost_log_access_ok(struct vhost_dev *dev) 442int vhost_log_access_ok(struct vhost_dev *dev)
442{ 443{
443 return memory_access_ok(dev, dev->memory, 1); 444 struct vhost_memory *mp;
445
446 mp = rcu_dereference_protected(dev->memory,
447 lockdep_is_held(&dev->mutex));
448 return memory_access_ok(dev, mp, 1);
444} 449}
445 450
446/* Verify access for write logging. */ 451/* Verify access for write logging. */
447/* Caller should have vq mutex and device mutex */ 452/* Caller should have vq mutex and device mutex */
448static int vq_log_access_ok(struct vhost_virtqueue *vq, void __user *log_base) 453static int vq_log_access_ok(struct vhost_virtqueue *vq, void __user *log_base)
449{ 454{
450 return vq_memory_access_ok(log_base, vq->dev->memory, 455 struct vhost_memory *mp;
456
457 mp = rcu_dereference_protected(vq->dev->memory,
458 lockdep_is_held(&vq->mutex));
459 return vq_memory_access_ok(log_base, mp,
451 vhost_has_feature(vq->dev, VHOST_F_LOG_ALL)) && 460 vhost_has_feature(vq->dev, VHOST_F_LOG_ALL)) &&
452 (!vq->log_used || log_access_ok(log_base, vq->log_addr, 461 (!vq->log_used || log_access_ok(log_base, vq->log_addr,
453 sizeof *vq->used + 462 sizeof *vq->used +
@@ -487,7 +496,8 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m)
487 kfree(newmem); 496 kfree(newmem);
488 return -EFAULT; 497 return -EFAULT;
489 } 498 }
490 oldmem = d->memory; 499 oldmem = rcu_dereference_protected(d->memory,
500 lockdep_is_held(&d->mutex));
491 rcu_assign_pointer(d->memory, newmem); 501 rcu_assign_pointer(d->memory, newmem);
492 synchronize_rcu(); 502 synchronize_rcu();
493 kfree(oldmem); 503 kfree(oldmem);
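
vhost_dev_reset_owner() and vhost_dev_cleanup() use RCU_INIT_POINTER() instead of rcu_assign_pointer(): on those paths no reader can still be looking at dev->memory, so the publication barrier implied by rcu_assign_pointer() is unnecessary. A small sketch of the distinction (dev_like is a made-up stand-in):

    #include <linux/rcupdate.h>

    struct vhost_memory;                    /* opaque for this sketch */

    struct dev_like {                       /* stand-in for vhost_dev */
            struct vhost_memory __rcu *memory;
    };

    static void publish_mem(struct dev_like *d, struct vhost_memory *m)
    {
            rcu_assign_pointer(d->memory, m);   /* readers may race: needs barrier */
    }

    static void reset_mem(struct dev_like *d, struct vhost_memory *m)
    {
            RCU_INIT_POINTER(d->memory, m);     /* no readers possible: no barrier */
    }
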
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index afd77295971c..af3c11ded5fd 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -106,7 +106,7 @@ struct vhost_virtqueue {
106 * vhost_work execution acts instead of rcu_read_lock() and the end of 106 * vhost_work execution acts instead of rcu_read_lock() and the end of
107 * vhost_work execution acts instead of rcu_read_lock(). 107 * vhost_work execution acts instead of rcu_read_lock().
108 * Writers use virtqueue mutex. */ 108 * Writers use virtqueue mutex. */
109 void *private_data; 109 void __rcu *private_data;
110 /* Log write descriptors */ 110 /* Log write descriptors */
111 void __user *log_base; 111 void __user *log_base;
112 struct vhost_log log[VHOST_NET_MAX_SG]; 112 struct vhost_log log[VHOST_NET_MAX_SG];
@@ -116,7 +116,7 @@ struct vhost_dev {
116 /* Readers use RCU to access memory table pointer 116 /* Readers use RCU to access memory table pointer
117 * log base pointer and features. 117 * log base pointer and features.
118 * Writers use mutex below.*/ 118 * Writers use mutex below.*/
119 struct vhost_memory *memory; 119 struct vhost_memory __rcu *memory;
120 struct mm_struct *mm; 120 struct mm_struct *mm;
121 struct mutex mutex; 121 struct mutex mutex;
122 unsigned acked_features; 122 unsigned acked_features;
@@ -173,7 +173,11 @@ enum {
173 173
174static inline int vhost_has_feature(struct vhost_dev *dev, int bit) 174static inline int vhost_has_feature(struct vhost_dev *dev, int bit)
175{ 175{
176 unsigned acked_features = rcu_dereference(dev->acked_features); 176 unsigned acked_features;
177
178 acked_features =
179 rcu_dereference_index_check(dev->acked_features,
180 lockdep_is_held(&dev->mutex));
177 return acked_features & (1 << bit); 181 return acked_features & (1 << bit);
178} 182}
179 183
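
The __rcu annotations on private_data and memory are what make the accessor discipline checkable: when the kernel is run through sparse with RCU pointer checking enabled, a direct dereference of an __rcu pointer is flagged, and only accesses through rcu_dereference*()/rcu_assign_pointer()/RCU_INIT_POINTER() pass cleanly. A minimal illustration of the accepted pattern (hypothetical types, same shape as the vhost fields):

    #include <linux/rcupdate.h>

    struct item;

    struct holder {
            struct item __rcu *cur;         /* annotated: RCU-managed pointer */
    };

    static int holder_has_item(struct holder *h)
    {
            struct item *p;
            int ret;

            rcu_read_lock();
            p = rcu_dereference(h->cur);    /* a bare h->cur read would be flagged */
            ret = (p != NULL);
            rcu_read_unlock();
            return ret;
    }
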
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 13365ba35218..7d24b0d94ed4 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -338,30 +338,29 @@ static void unmask_evtchn(int port)
338 338
339static int find_unbound_irq(void) 339static int find_unbound_irq(void)
340{ 340{
341 int irq; 341 struct irq_data *data;
342 struct irq_desc *desc; 342 int irq, res;
343 343
344 for (irq = 0; irq < nr_irqs; irq++) { 344 for (irq = 0; irq < nr_irqs; irq++) {
345 desc = irq_to_desc(irq); 345 data = irq_get_irq_data(irq);
346 /* only 0->15 have init'd desc; handle irq > 16 */ 346 /* only 0->15 have init'd desc; handle irq > 16 */
347 if (desc == NULL) 347 if (!data)
348 break; 348 break;
349 if (desc->chip == &no_irq_chip) 349 if (data->chip == &no_irq_chip)
350 break; 350 break;
351 if (desc->chip != &xen_dynamic_chip) 351 if (data->chip != &xen_dynamic_chip)
352 continue; 352 continue;
353 if (irq_info[irq].type == IRQT_UNBOUND) 353 if (irq_info[irq].type == IRQT_UNBOUND)
354 break; 354 return irq;
355 } 355 }
356 356
357 if (irq == nr_irqs) 357 if (irq == nr_irqs)
358 panic("No available IRQ to bind to: increase nr_irqs!\n"); 358 panic("No available IRQ to bind to: increase nr_irqs!\n");
359 359
360 desc = irq_to_desc_alloc_node(irq, 0); 360 res = irq_alloc_desc_at(irq, 0);
361 if (WARN_ON(desc == NULL))
362 return -1;
363 361
364 dynamic_irq_init_keep_chip_data(irq); 362 if (WARN_ON(res != irq))
363 return -1;
365 364
366 return irq; 365 return irq;
367} 366}
@@ -495,7 +494,7 @@ static void unbind_from_irq(unsigned int irq)
495 if (irq_info[irq].type != IRQT_UNBOUND) { 494 if (irq_info[irq].type != IRQT_UNBOUND) {
496 irq_info[irq] = mk_unbound_info(); 495 irq_info[irq] = mk_unbound_info();
497 496
498 dynamic_irq_cleanup(irq); 497 irq_free_desc(irq);
499 } 498 }
500 499
501 spin_unlock(&irq_mapping_update_lock); 500 spin_unlock(&irq_mapping_update_lock);
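
find_unbound_irq() and unbind_from_irq() move to the new descriptor allocator: irq_alloc_desc_at() reserves the requested irq number and initializes its descriptor in one call (returning the irq on success, a negative error otherwise), and irq_free_desc() is the matching release that replaces dynamic_irq_cleanup(). A brief sketch of the pairing, with made-up wrapper names:

    #include <linux/irq.h>

    static int claim_irq(unsigned int irq)
    {
            int res = irq_alloc_desc_at(irq, 0);    /* allocate on node 0 */

            if (res < 0)
                    return res;                     /* already in use or no memory */
            WARN_ON(res != irq);                    /* should get exactly the one asked for */
            return irq;
    }

    static void release_irq(unsigned int irq)
    {
            irq_free_desc(irq);                     /* was dynamic_irq_cleanup() */
    }
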
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 33c4e7eef470..9581ea94d5a1 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -109,8 +109,8 @@ static void init_once(void *foo)
109{ 109{
110 struct affs_inode_info *ei = (struct affs_inode_info *) foo; 110 struct affs_inode_info *ei = (struct affs_inode_info *) foo;
111 111
112 init_MUTEX(&ei->i_link_lock); 112 sema_init(&ei->i_link_lock, 1);
113 init_MUTEX(&ei->i_ext_lock); 113 sema_init(&ei->i_ext_lock, 1);
114 inode_init_once(&ei->vfs_inode); 114 inode_init_once(&ei->vfs_inode);
115} 115}
116 116
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 0fcd2640c23f..9eb134ea6eb2 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -1,9 +1,11 @@
1config CEPH_FS 1config CEPH_FS
2 tristate "Ceph distributed file system (EXPERIMENTAL)" 2 tristate "Ceph distributed file system (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL 3 depends on INET && EXPERIMENTAL
4 select CEPH_LIB
4 select LIBCRC32C 5 select LIBCRC32C
5 select CRYPTO_AES 6 select CRYPTO_AES
6 select CRYPTO 7 select CRYPTO
8 default n
7 help 9 help
8 Choose Y or M here to include support for mounting the 10 Choose Y or M here to include support for mounting the
9 experimental Ceph distributed file system. Ceph is an extremely 11 experimental Ceph distributed file system. Ceph is an extremely
@@ -14,15 +16,3 @@ config CEPH_FS
14 16
15 If unsure, say N. 17 If unsure, say N.
16 18
17config CEPH_FS_PRETTYDEBUG
18 bool "Include file:line in ceph debug output"
19 depends on CEPH_FS
20 default n
21 help
22 If you say Y here, debug output will include a filename and
23 line to aid debugging. This icnreases kernel size and slows
24 execution slightly when debug call sites are enabled (e.g.,
25 via CONFIG_DYNAMIC_DEBUG).
26
27 If unsure, say N.
28
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 278e1172600d..9e6c4f2e8ff1 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -8,15 +8,8 @@ obj-$(CONFIG_CEPH_FS) += ceph.o
8 8
9ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ 9ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
10 export.o caps.o snap.o xattr.o \ 10 export.o caps.o snap.o xattr.o \
11 messenger.o msgpool.o buffer.o pagelist.o \ 11 mds_client.o mdsmap.o strings.o ceph_frag.o \
12 mds_client.o mdsmap.o \ 12 debugfs.o
13 mon_client.o \
14 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
15 debugfs.o \
16 auth.o auth_none.o \
17 crypto.o armor.o \
18 auth_x.o \
19 ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
20 13
21else 14else
22#Otherwise we were called directly from the command 15#Otherwise we were called directly from the command
diff --git a/fs/ceph/README b/fs/ceph/README
deleted file mode 100644
index 18352fab37c0..000000000000
--- a/fs/ceph/README
+++ /dev/null
@@ -1,20 +0,0 @@
1#
2# The following files are shared by (and manually synchronized
3# between) the Ceph userland and kernel client.
4#
5# userland kernel
6src/include/ceph_fs.h fs/ceph/ceph_fs.h
7src/include/ceph_fs.cc fs/ceph/ceph_fs.c
8src/include/msgr.h fs/ceph/msgr.h
9src/include/rados.h fs/ceph/rados.h
10src/include/ceph_strings.cc fs/ceph/ceph_strings.c
11src/include/ceph_frag.h fs/ceph/ceph_frag.h
12src/include/ceph_frag.cc fs/ceph/ceph_frag.c
13src/include/ceph_hash.h fs/ceph/ceph_hash.h
14src/include/ceph_hash.cc fs/ceph/ceph_hash.c
15src/crush/crush.c fs/ceph/crush/crush.c
16src/crush/crush.h fs/ceph/crush/crush.h
17src/crush/mapper.c fs/ceph/crush/mapper.c
18src/crush/mapper.h fs/ceph/crush/mapper.h
19src/crush/hash.h fs/ceph/crush/hash.h
20src/crush/hash.c fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index efbc604001c8..51bcc5ce3230 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/backing-dev.h> 3#include <linux/backing-dev.h>
4#include <linux/fs.h> 4#include <linux/fs.h>
@@ -10,7 +10,8 @@
10#include <linux/task_io_accounting_ops.h> 10#include <linux/task_io_accounting_ops.h>
11 11
12#include "super.h" 12#include "super.h"
13#include "osd_client.h" 13#include "mds_client.h"
14#include <linux/ceph/osd_client.h>
14 15
15/* 16/*
16 * Ceph address space ops. 17 * Ceph address space ops.
@@ -193,7 +194,8 @@ static int readpage_nounlock(struct file *filp, struct page *page)
193{ 194{
194 struct inode *inode = filp->f_dentry->d_inode; 195 struct inode *inode = filp->f_dentry->d_inode;
195 struct ceph_inode_info *ci = ceph_inode(inode); 196 struct ceph_inode_info *ci = ceph_inode(inode);
196 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; 197 struct ceph_osd_client *osdc =
198 &ceph_inode_to_client(inode)->client->osdc;
197 int err = 0; 199 int err = 0;
198 u64 len = PAGE_CACHE_SIZE; 200 u64 len = PAGE_CACHE_SIZE;
199 201
@@ -265,7 +267,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
265{ 267{
266 struct inode *inode = file->f_dentry->d_inode; 268 struct inode *inode = file->f_dentry->d_inode;
267 struct ceph_inode_info *ci = ceph_inode(inode); 269 struct ceph_inode_info *ci = ceph_inode(inode);
268 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc; 270 struct ceph_osd_client *osdc =
271 &ceph_inode_to_client(inode)->client->osdc;
269 int rc = 0; 272 int rc = 0;
270 struct page **pages; 273 struct page **pages;
271 loff_t offset; 274 loff_t offset;
@@ -365,7 +368,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
365{ 368{
366 struct inode *inode; 369 struct inode *inode;
367 struct ceph_inode_info *ci; 370 struct ceph_inode_info *ci;
368 struct ceph_client *client; 371 struct ceph_fs_client *fsc;
369 struct ceph_osd_client *osdc; 372 struct ceph_osd_client *osdc;
370 loff_t page_off = page->index << PAGE_CACHE_SHIFT; 373 loff_t page_off = page->index << PAGE_CACHE_SHIFT;
371 int len = PAGE_CACHE_SIZE; 374 int len = PAGE_CACHE_SIZE;
@@ -383,8 +386,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
383 } 386 }
384 inode = page->mapping->host; 387 inode = page->mapping->host;
385 ci = ceph_inode(inode); 388 ci = ceph_inode(inode);
386 client = ceph_inode_to_client(inode); 389 fsc = ceph_inode_to_client(inode);
387 osdc = &client->osdc; 390 osdc = &fsc->client->osdc;
388 391
389 /* verify this is a writeable snap context */ 392 /* verify this is a writeable snap context */
390 snapc = (void *)page->private; 393 snapc = (void *)page->private;
@@ -414,10 +417,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
414 dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", 417 dout("writepage %p page %p index %lu on %llu~%u snapc %p\n",
415 inode, page, page->index, page_off, len, snapc); 418 inode, page, page->index, page_off, len, snapc);
416 419
417 writeback_stat = atomic_long_inc_return(&client->writeback_count); 420 writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
418 if (writeback_stat > 421 if (writeback_stat >
419 CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) 422 CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
420 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC); 423 set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC);
421 424
422 set_page_writeback(page); 425 set_page_writeback(page);
423 err = ceph_osdc_writepages(osdc, ceph_vino(inode), 426 err = ceph_osdc_writepages(osdc, ceph_vino(inode),
@@ -496,7 +499,7 @@ static void writepages_finish(struct ceph_osd_request *req,
496 struct address_space *mapping = inode->i_mapping; 499 struct address_space *mapping = inode->i_mapping;
497 __s32 rc = -EIO; 500 __s32 rc = -EIO;
498 u64 bytes = 0; 501 u64 bytes = 0;
499 struct ceph_client *client = ceph_inode_to_client(inode); 502 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
500 long writeback_stat; 503 long writeback_stat;
501 unsigned issued = ceph_caps_issued(ci); 504 unsigned issued = ceph_caps_issued(ci);
502 505
@@ -529,10 +532,10 @@ static void writepages_finish(struct ceph_osd_request *req,
529 WARN_ON(!PageUptodate(page)); 532 WARN_ON(!PageUptodate(page));
530 533
531 writeback_stat = 534 writeback_stat =
532 atomic_long_dec_return(&client->writeback_count); 535 atomic_long_dec_return(&fsc->writeback_count);
533 if (writeback_stat < 536 if (writeback_stat <
534 CONGESTION_OFF_THRESH(client->mount_args->congestion_kb)) 537 CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
535 clear_bdi_congested(&client->backing_dev_info, 538 clear_bdi_congested(&fsc->backing_dev_info,
536 BLK_RW_ASYNC); 539 BLK_RW_ASYNC);
537 540
538 ceph_put_snap_context((void *)page->private); 541 ceph_put_snap_context((void *)page->private);
@@ -569,13 +572,13 @@ static void writepages_finish(struct ceph_osd_request *req,
569 * mempool. we avoid the mempool if we can because req->r_num_pages 572 * mempool. we avoid the mempool if we can because req->r_num_pages
570 * may be less than the maximum write size. 573 * may be less than the maximum write size.
571 */ 574 */
572static void alloc_page_vec(struct ceph_client *client, 575static void alloc_page_vec(struct ceph_fs_client *fsc,
573 struct ceph_osd_request *req) 576 struct ceph_osd_request *req)
574{ 577{
575 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages, 578 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
576 GFP_NOFS); 579 GFP_NOFS);
577 if (!req->r_pages) { 580 if (!req->r_pages) {
578 req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS); 581 req->r_pages = mempool_alloc(fsc->wb_pagevec_pool, GFP_NOFS);
579 req->r_pages_from_pool = 1; 582 req->r_pages_from_pool = 1;
580 WARN_ON(!req->r_pages); 583 WARN_ON(!req->r_pages);
581 } 584 }
@@ -590,7 +593,7 @@ static int ceph_writepages_start(struct address_space *mapping,
590 struct inode *inode = mapping->host; 593 struct inode *inode = mapping->host;
591 struct backing_dev_info *bdi = mapping->backing_dev_info; 594 struct backing_dev_info *bdi = mapping->backing_dev_info;
592 struct ceph_inode_info *ci = ceph_inode(inode); 595 struct ceph_inode_info *ci = ceph_inode(inode);
593 struct ceph_client *client; 596 struct ceph_fs_client *fsc;
594 pgoff_t index, start, end; 597 pgoff_t index, start, end;
595 int range_whole = 0; 598 int range_whole = 0;
596 int should_loop = 1; 599 int should_loop = 1;
@@ -617,13 +620,13 @@ static int ceph_writepages_start(struct address_space *mapping,
617 wbc->sync_mode == WB_SYNC_NONE ? "NONE" : 620 wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
618 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); 621 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
619 622
620 client = ceph_inode_to_client(inode); 623 fsc = ceph_inode_to_client(inode);
621 if (client->mount_state == CEPH_MOUNT_SHUTDOWN) { 624 if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) {
622 pr_warning("writepage_start %p on forced umount\n", inode); 625 pr_warning("writepage_start %p on forced umount\n", inode);
623 return -EIO; /* we're in a forced umount, don't write! */ 626 return -EIO; /* we're in a forced umount, don't write! */
624 } 627 }
625 if (client->mount_args->wsize && client->mount_args->wsize < wsize) 628 if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize)
626 wsize = client->mount_args->wsize; 629 wsize = fsc->mount_options->wsize;
627 if (wsize < PAGE_CACHE_SIZE) 630 if (wsize < PAGE_CACHE_SIZE)
628 wsize = PAGE_CACHE_SIZE; 631 wsize = PAGE_CACHE_SIZE;
629 max_pages_ever = wsize >> PAGE_CACHE_SHIFT; 632 max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
@@ -769,7 +772,7 @@ get_more_pages:
769 offset = (unsigned long long)page->index 772 offset = (unsigned long long)page->index
770 << PAGE_CACHE_SHIFT; 773 << PAGE_CACHE_SHIFT;
771 len = wsize; 774 len = wsize;
772 req = ceph_osdc_new_request(&client->osdc, 775 req = ceph_osdc_new_request(&fsc->client->osdc,
773 &ci->i_layout, 776 &ci->i_layout,
774 ceph_vino(inode), 777 ceph_vino(inode),
775 offset, &len, 778 offset, &len,
@@ -782,7 +785,7 @@ get_more_pages:
782 &inode->i_mtime, true, 1); 785 &inode->i_mtime, true, 1);
783 max_pages = req->r_num_pages; 786 max_pages = req->r_num_pages;
784 787
785 alloc_page_vec(client, req); 788 alloc_page_vec(fsc, req);
786 req->r_callback = writepages_finish; 789 req->r_callback = writepages_finish;
787 req->r_inode = inode; 790 req->r_inode = inode;
788 } 791 }
@@ -794,10 +797,10 @@ get_more_pages:
794 inode, page, page->index); 797 inode, page, page->index);
795 798
796 writeback_stat = 799 writeback_stat =
797 atomic_long_inc_return(&client->writeback_count); 800 atomic_long_inc_return(&fsc->writeback_count);
798 if (writeback_stat > CONGESTION_ON_THRESH( 801 if (writeback_stat > CONGESTION_ON_THRESH(
799 client->mount_args->congestion_kb)) { 802 fsc->mount_options->congestion_kb)) {
800 set_bdi_congested(&client->backing_dev_info, 803 set_bdi_congested(&fsc->backing_dev_info,
801 BLK_RW_ASYNC); 804 BLK_RW_ASYNC);
802 } 805 }
803 806
@@ -846,7 +849,7 @@ get_more_pages:
846 op->payload_len = cpu_to_le32(len); 849 op->payload_len = cpu_to_le32(len);
847 req->r_request->hdr.data_len = cpu_to_le32(len); 850 req->r_request->hdr.data_len = cpu_to_le32(len);
848 851
849 ceph_osdc_start_request(&client->osdc, req, true); 852 ceph_osdc_start_request(&fsc->client->osdc, req, true);
850 req = NULL; 853 req = NULL;
851 854
852 /* continue? */ 855 /* continue? */
@@ -915,7 +918,7 @@ static int ceph_update_writeable_page(struct file *file,
915{ 918{
916 struct inode *inode = file->f_dentry->d_inode; 919 struct inode *inode = file->f_dentry->d_inode;
917 struct ceph_inode_info *ci = ceph_inode(inode); 920 struct ceph_inode_info *ci = ceph_inode(inode);
918 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; 921 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
919 loff_t page_off = pos & PAGE_CACHE_MASK; 922 loff_t page_off = pos & PAGE_CACHE_MASK;
920 int pos_in_page = pos & ~PAGE_CACHE_MASK; 923 int pos_in_page = pos & ~PAGE_CACHE_MASK;
921 int end_in_page = pos_in_page + len; 924 int end_in_page = pos_in_page + len;
@@ -1053,8 +1056,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
1053 struct page *page, void *fsdata) 1056 struct page *page, void *fsdata)
1054{ 1057{
1055 struct inode *inode = file->f_dentry->d_inode; 1058 struct inode *inode = file->f_dentry->d_inode;
1056 struct ceph_client *client = ceph_inode_to_client(inode); 1059 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
1057 struct ceph_mds_client *mdsc = &client->mdsc; 1060 struct ceph_mds_client *mdsc = fsc->mdsc;
1058 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 1061 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1059 int check_cap = 0; 1062 int check_cap = 0;
1060 1063
@@ -1123,7 +1126,7 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1123{ 1126{
1124 struct inode *inode = vma->vm_file->f_dentry->d_inode; 1127 struct inode *inode = vma->vm_file->f_dentry->d_inode;
1125 struct page *page = vmf->page; 1128 struct page *page = vmf->page;
1126 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; 1129 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
1127 loff_t off = page->index << PAGE_CACHE_SHIFT; 1130 loff_t off = page->index << PAGE_CACHE_SHIFT;
1128 loff_t size, len; 1131 loff_t size, len;
1129 int ret; 1132 int ret;
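
All of the changes in this file are fallout from splitting the old monolithic ceph client: the filesystem now has its own struct ceph_fs_client carrying fs-only state (mount options, the mdsc pointer, writeback accounting, the bdi), which in turn points at the shared libceph struct ceph_client where the osd client lives. The resulting access pattern, with field names taken from the hunks above and everything else elided (the helper name is invented):

    /* rough shape only; the real definitions live in the ceph headers */
    struct ceph_fs_client {
            struct ceph_mount_options *mount_options;   /* was mount_args      */
            struct ceph_client *client;                 /* shared libceph core */
            struct ceph_mds_client *mdsc;               /* now a pointer       */
            /* ... writeback_count, backing_dev_info, mount_state ... */
    };

    static struct ceph_osd_client *inode_osdc(struct inode *inode)
    {
            struct ceph_fs_client *fsc = ceph_inode_to_client(inode);

            return &fsc->client->osdc;      /* osd client now lives in libceph */
    }
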
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 5e9da996a151..98ab13e2b71d 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/fs.h> 3#include <linux/fs.h>
4#include <linux/kernel.h> 4#include <linux/kernel.h>
@@ -9,8 +9,9 @@
9#include <linux/writeback.h> 9#include <linux/writeback.h>
10 10
11#include "super.h" 11#include "super.h"
12#include "decode.h" 12#include "mds_client.h"
13#include "messenger.h" 13#include <linux/ceph/decode.h>
14#include <linux/ceph/messenger.h>
14 15
15/* 16/*
16 * Capability management 17 * Capability management
@@ -287,11 +288,11 @@ void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
287 spin_unlock(&mdsc->caps_list_lock); 288 spin_unlock(&mdsc->caps_list_lock);
288} 289}
289 290
290void ceph_reservation_status(struct ceph_client *client, 291void ceph_reservation_status(struct ceph_fs_client *fsc,
291 int *total, int *avail, int *used, int *reserved, 292 int *total, int *avail, int *used, int *reserved,
292 int *min) 293 int *min)
293{ 294{
294 struct ceph_mds_client *mdsc = &client->mdsc; 295 struct ceph_mds_client *mdsc = fsc->mdsc;
295 296
296 if (total) 297 if (total)
297 *total = mdsc->caps_total_count; 298 *total = mdsc->caps_total_count;
@@ -399,7 +400,7 @@ static void __insert_cap_node(struct ceph_inode_info *ci,
399static void __cap_set_timeouts(struct ceph_mds_client *mdsc, 400static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
400 struct ceph_inode_info *ci) 401 struct ceph_inode_info *ci)
401{ 402{
402 struct ceph_mount_args *ma = mdsc->client->mount_args; 403 struct ceph_mount_options *ma = mdsc->fsc->mount_options;
403 404
404 ci->i_hold_caps_min = round_jiffies(jiffies + 405 ci->i_hold_caps_min = round_jiffies(jiffies +
405 ma->caps_wanted_delay_min * HZ); 406 ma->caps_wanted_delay_min * HZ);
@@ -515,7 +516,7 @@ int ceph_add_cap(struct inode *inode,
515 unsigned seq, unsigned mseq, u64 realmino, int flags, 516 unsigned seq, unsigned mseq, u64 realmino, int flags,
516 struct ceph_cap_reservation *caps_reservation) 517 struct ceph_cap_reservation *caps_reservation)
517{ 518{
518 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; 519 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
519 struct ceph_inode_info *ci = ceph_inode(inode); 520 struct ceph_inode_info *ci = ceph_inode(inode);
520 struct ceph_cap *new_cap = NULL; 521 struct ceph_cap *new_cap = NULL;
521 struct ceph_cap *cap; 522 struct ceph_cap *cap;
@@ -873,7 +874,7 @@ void __ceph_remove_cap(struct ceph_cap *cap)
873 struct ceph_mds_session *session = cap->session; 874 struct ceph_mds_session *session = cap->session;
874 struct ceph_inode_info *ci = cap->ci; 875 struct ceph_inode_info *ci = cap->ci;
875 struct ceph_mds_client *mdsc = 876 struct ceph_mds_client *mdsc =
876 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; 877 ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
877 int removed = 0; 878 int removed = 0;
878 879
879 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); 880 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
@@ -1210,7 +1211,7 @@ void __ceph_flush_snaps(struct ceph_inode_info *ci,
1210 int mds; 1211 int mds;
1211 struct ceph_cap_snap *capsnap; 1212 struct ceph_cap_snap *capsnap;
1212 u32 mseq; 1213 u32 mseq;
1213 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc; 1214 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
1214 struct ceph_mds_session *session = NULL; /* if session != NULL, we hold 1215 struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
1215 session->s_mutex */ 1216 session->s_mutex */
1216 u64 next_follows = 0; /* keep track of how far we've gotten through the 1217 u64 next_follows = 0; /* keep track of how far we've gotten through the
@@ -1336,7 +1337,7 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
1336void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) 1337void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1337{ 1338{
1338 struct ceph_mds_client *mdsc = 1339 struct ceph_mds_client *mdsc =
1339 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; 1340 ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
1340 struct inode *inode = &ci->vfs_inode; 1341 struct inode *inode = &ci->vfs_inode;
1341 int was = ci->i_dirty_caps; 1342 int was = ci->i_dirty_caps;
1342 int dirty = 0; 1343 int dirty = 0;
@@ -1378,7 +1379,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1378static int __mark_caps_flushing(struct inode *inode, 1379static int __mark_caps_flushing(struct inode *inode,
1379 struct ceph_mds_session *session) 1380 struct ceph_mds_session *session)
1380{ 1381{
1381 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; 1382 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
1382 struct ceph_inode_info *ci = ceph_inode(inode); 1383 struct ceph_inode_info *ci = ceph_inode(inode);
1383 int flushing; 1384 int flushing;
1384 1385
@@ -1416,17 +1417,6 @@ static int __mark_caps_flushing(struct inode *inode,
1416/* 1417/*
1417 * try to invalidate mapping pages without blocking. 1418 * try to invalidate mapping pages without blocking.
1418 */ 1419 */
1419static int mapping_is_empty(struct address_space *mapping)
1420{
1421 struct page *page = find_get_page(mapping, 0);
1422
1423 if (!page)
1424 return 1;
1425
1426 put_page(page);
1427 return 0;
1428}
1429
1430static int try_nonblocking_invalidate(struct inode *inode) 1420static int try_nonblocking_invalidate(struct inode *inode)
1431{ 1421{
1432 struct ceph_inode_info *ci = ceph_inode(inode); 1422 struct ceph_inode_info *ci = ceph_inode(inode);
@@ -1436,7 +1426,7 @@ static int try_nonblocking_invalidate(struct inode *inode)
1436 invalidate_mapping_pages(&inode->i_data, 0, -1); 1426 invalidate_mapping_pages(&inode->i_data, 0, -1);
1437 spin_lock(&inode->i_lock); 1427 spin_lock(&inode->i_lock);
1438 1428
1439 if (mapping_is_empty(&inode->i_data) && 1429 if (inode->i_data.nrpages == 0 &&
1440 invalidating_gen == ci->i_rdcache_gen) { 1430 invalidating_gen == ci->i_rdcache_gen) {
1441 /* success. */ 1431 /* success. */
1442 dout("try_nonblocking_invalidate %p success\n", inode); 1432 dout("try_nonblocking_invalidate %p success\n", inode);
@@ -1462,8 +1452,8 @@ static int try_nonblocking_invalidate(struct inode *inode)
1462void ceph_check_caps(struct ceph_inode_info *ci, int flags, 1452void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1463 struct ceph_mds_session *session) 1453 struct ceph_mds_session *session)
1464{ 1454{
1465 struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode); 1455 struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
1466 struct ceph_mds_client *mdsc = &client->mdsc; 1456 struct ceph_mds_client *mdsc = fsc->mdsc;
1467 struct inode *inode = &ci->vfs_inode; 1457 struct inode *inode = &ci->vfs_inode;
1468 struct ceph_cap *cap; 1458 struct ceph_cap *cap;
1469 int file_wanted, used; 1459 int file_wanted, used;
@@ -1533,7 +1523,7 @@ retry_locked:
1533 */ 1523 */
1534 if ((!is_delayed || mdsc->stopping) && 1524 if ((!is_delayed || mdsc->stopping) &&
1535 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ 1525 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
1536 ci->i_rdcache_gen && /* may have cached pages */ 1526 inode->i_data.nrpages && /* have cached pages */
1537 (file_wanted == 0 || /* no open files */ 1527 (file_wanted == 0 || /* no open files */
1538 (revoking & (CEPH_CAP_FILE_CACHE| 1528 (revoking & (CEPH_CAP_FILE_CACHE|
1539 CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */ 1529 CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */
@@ -1706,7 +1696,7 @@ ack:
1706static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session, 1696static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
1707 unsigned *flush_tid) 1697 unsigned *flush_tid)
1708{ 1698{
1709 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; 1699 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
1710 struct ceph_inode_info *ci = ceph_inode(inode); 1700 struct ceph_inode_info *ci = ceph_inode(inode);
1711 int unlock_session = session ? 0 : 1; 1701 int unlock_session = session ? 0 : 1;
1712 int flushing = 0; 1702 int flushing = 0;
@@ -1872,7 +1862,7 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1872 caps_are_flushed(inode, flush_tid)); 1862 caps_are_flushed(inode, flush_tid));
1873 } else { 1863 } else {
1874 struct ceph_mds_client *mdsc = 1864 struct ceph_mds_client *mdsc =
1875 &ceph_sb_to_client(inode->i_sb)->mdsc; 1865 ceph_sb_to_client(inode->i_sb)->mdsc;
1876 1866
1877 spin_lock(&inode->i_lock); 1867 spin_lock(&inode->i_lock);
1878 if (__ceph_caps_dirty(ci)) 1868 if (__ceph_caps_dirty(ci))
@@ -2465,7 +2455,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2465 __releases(inode->i_lock) 2455 __releases(inode->i_lock)
2466{ 2456{
2467 struct ceph_inode_info *ci = ceph_inode(inode); 2457 struct ceph_inode_info *ci = ceph_inode(inode);
2468 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; 2458 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
2469 unsigned seq = le32_to_cpu(m->seq); 2459 unsigned seq = le32_to_cpu(m->seq);
2470 int dirty = le32_to_cpu(m->dirty); 2460 int dirty = le32_to_cpu(m->dirty);
2471 int cleaned = 0; 2461 int cleaned = 0;
@@ -2713,7 +2703,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2713 struct ceph_msg *msg) 2703 struct ceph_msg *msg)
2714{ 2704{
2715 struct ceph_mds_client *mdsc = session->s_mdsc; 2705 struct ceph_mds_client *mdsc = session->s_mdsc;
2716 struct super_block *sb = mdsc->client->sb; 2706 struct super_block *sb = mdsc->fsc->sb;
2717 struct inode *inode; 2707 struct inode *inode;
2718 struct ceph_cap *cap; 2708 struct ceph_cap *cap;
2719 struct ceph_mds_caps *h; 2709 struct ceph_mds_caps *h;
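
Besides the mdsc/fsc conversions, caps.c drops its private mapping_is_empty() helper: probing for page 0 with find_get_page() only proves that page 0 is absent, while the address_space already keeps an exact page count, so try_nonblocking_invalidate() now tests inode->i_data.nrpages == 0 directly (ceph_check_caps() uses the same counter to decide whether there is any cache worth dropping). A condensed sketch of the new check, with locking elided and a made-up wrapper name:

    /* illustrative condensation of try_nonblocking_invalidate() */
    static int try_drop_clean_pages(struct inode *inode, u32 my_gen, u32 cur_gen)
    {
            invalidate_mapping_pages(&inode->i_data, 0, -1);

            if (inode->i_data.nrpages == 0 && my_gen == cur_gen)
                    return 0;       /* mapping really empty and nothing raced in */
            return -1;
    }
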
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
index ab6cf35c4091..bdce8b1fbd06 100644
--- a/fs/ceph/ceph_frag.c
+++ b/fs/ceph/ceph_frag.c
@@ -1,7 +1,8 @@
1/* 1/*
2 * Ceph 'frag' type 2 * Ceph 'frag' type
3 */ 3 */
4#include "types.h" 4#include <linux/module.h>
5#include <linux/ceph/types.h>
5 6
6int ceph_frag_compare(__u32 a, __u32 b) 7int ceph_frag_compare(__u32 a, __u32 b)
7{ 8{
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 6fd8b20a8611..7ae1b3d55b58 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/device.h> 3#include <linux/device.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
@@ -7,143 +7,49 @@
7#include <linux/debugfs.h> 7#include <linux/debugfs.h>
8#include <linux/seq_file.h> 8#include <linux/seq_file.h>
9 9
10#include <linux/ceph/libceph.h>
11#include <linux/ceph/mon_client.h>
12#include <linux/ceph/auth.h>
13#include <linux/ceph/debugfs.h>
14
10#include "super.h" 15#include "super.h"
11#include "mds_client.h"
12#include "mon_client.h"
13#include "auth.h"
14 16
15#ifdef CONFIG_DEBUG_FS 17#ifdef CONFIG_DEBUG_FS
16 18
17/* 19#include "mds_client.h"
18 * Implement /sys/kernel/debug/ceph fun
19 *
20 * /sys/kernel/debug/ceph/client* - an instance of the ceph client
21 * .../osdmap - current osdmap
22 * .../mdsmap - current mdsmap
23 * .../monmap - current monmap
24 * .../osdc - active osd requests
25 * .../mdsc - active mds requests
26 * .../monc - mon client state
27 * .../dentry_lru - dump contents of dentry lru
28 * .../caps - expose cap (reservation) stats
29 * .../bdi - symlink to ../../bdi/something
30 */
31
32static struct dentry *ceph_debugfs_dir;
33
34static int monmap_show(struct seq_file *s, void *p)
35{
36 int i;
37 struct ceph_client *client = s->private;
38
39 if (client->monc.monmap == NULL)
40 return 0;
41
42 seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
43 for (i = 0; i < client->monc.monmap->num_mon; i++) {
44 struct ceph_entity_inst *inst =
45 &client->monc.monmap->mon_inst[i];
46
47 seq_printf(s, "\t%s%lld\t%s\n",
48 ENTITY_NAME(inst->name),
49 pr_addr(&inst->addr.in_addr));
50 }
51 return 0;
52}
53 20
54static int mdsmap_show(struct seq_file *s, void *p) 21static int mdsmap_show(struct seq_file *s, void *p)
55{ 22{
56 int i; 23 int i;
57 struct ceph_client *client = s->private; 24 struct ceph_fs_client *fsc = s->private;
58 25
59 if (client->mdsc.mdsmap == NULL) 26 if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL)
60 return 0; 27 return 0;
61 seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch); 28 seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch);
62 seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root); 29 seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root);
63 seq_printf(s, "session_timeout %d\n", 30 seq_printf(s, "session_timeout %d\n",
64 client->mdsc.mdsmap->m_session_timeout); 31 fsc->mdsc->mdsmap->m_session_timeout);
65 seq_printf(s, "session_autoclose %d\n", 32 seq_printf(s, "session_autoclose %d\n",
66 client->mdsc.mdsmap->m_session_autoclose); 33 fsc->mdsc->mdsmap->m_session_autoclose);
67 for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) { 34 for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) {
68 struct ceph_entity_addr *addr = 35 struct ceph_entity_addr *addr =
69 &client->mdsc.mdsmap->m_info[i].addr; 36 &fsc->mdsc->mdsmap->m_info[i].addr;
70 int state = client->mdsc.mdsmap->m_info[i].state; 37 int state = fsc->mdsc->mdsmap->m_info[i].state;
71 38
72 seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr), 39 seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
40 ceph_pr_addr(&addr->in_addr),
73 ceph_mds_state_name(state)); 41 ceph_mds_state_name(state));
74 } 42 }
75 return 0; 43 return 0;
76} 44}
77 45
78static int osdmap_show(struct seq_file *s, void *p) 46/*
79{ 47 * mdsc debugfs
80 int i; 48 */
81 struct ceph_client *client = s->private;
82 struct rb_node *n;
83
84 if (client->osdc.osdmap == NULL)
85 return 0;
86 seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
87 seq_printf(s, "flags%s%s\n",
88 (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
89 " NEARFULL" : "",
90 (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
91 " FULL" : "");
92 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
93 struct ceph_pg_pool_info *pool =
94 rb_entry(n, struct ceph_pg_pool_info, node);
95 seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
96 pool->id, pool->v.pg_num, pool->pg_num_mask,
97 pool->v.lpg_num, pool->lpg_num_mask);
98 }
99 for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
100 struct ceph_entity_addr *addr =
101 &client->osdc.osdmap->osd_addr[i];
102 int state = client->osdc.osdmap->osd_state[i];
103 char sb[64];
104
105 seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
106 i, pr_addr(&addr->in_addr),
107 ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
108 ceph_osdmap_state_str(sb, sizeof(sb), state));
109 }
110 return 0;
111}
112
113static int monc_show(struct seq_file *s, void *p)
114{
115 struct ceph_client *client = s->private;
116 struct ceph_mon_generic_request *req;
117 struct ceph_mon_client *monc = &client->monc;
118 struct rb_node *rp;
119
120 mutex_lock(&monc->mutex);
121
122 if (monc->have_mdsmap)
123 seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
124 if (monc->have_osdmap)
125 seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
126 if (monc->want_next_osdmap)
127 seq_printf(s, "want next osdmap\n");
128
129 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
130 __u16 op;
131 req = rb_entry(rp, struct ceph_mon_generic_request, node);
132 op = le16_to_cpu(req->request->hdr.type);
133 if (op == CEPH_MSG_STATFS)
134 seq_printf(s, "%lld statfs\n", req->tid);
135 else
136 seq_printf(s, "%lld unknown\n", req->tid);
137 }
138
139 mutex_unlock(&monc->mutex);
140 return 0;
141}
142
143static int mdsc_show(struct seq_file *s, void *p) 49static int mdsc_show(struct seq_file *s, void *p)
144{ 50{
145 struct ceph_client *client = s->private; 51 struct ceph_fs_client *fsc = s->private;
146 struct ceph_mds_client *mdsc = &client->mdsc; 52 struct ceph_mds_client *mdsc = fsc->mdsc;
147 struct ceph_mds_request *req; 53 struct ceph_mds_request *req;
148 struct rb_node *rp; 54 struct rb_node *rp;
149 int pathlen; 55 int pathlen;
@@ -214,61 +120,12 @@ static int mdsc_show(struct seq_file *s, void *p)
214 return 0; 120 return 0;
215} 121}
216 122
217static int osdc_show(struct seq_file *s, void *pp)
218{
219 struct ceph_client *client = s->private;
220 struct ceph_osd_client *osdc = &client->osdc;
221 struct rb_node *p;
222
223 mutex_lock(&osdc->request_mutex);
224 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
225 struct ceph_osd_request *req;
226 struct ceph_osd_request_head *head;
227 struct ceph_osd_op *op;
228 int num_ops;
229 int opcode, olen;
230 int i;
231
232 req = rb_entry(p, struct ceph_osd_request, r_node);
233
234 seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
235 req->r_osd ? req->r_osd->o_osd : -1,
236 le32_to_cpu(req->r_pgid.pool),
237 le16_to_cpu(req->r_pgid.ps));
238
239 head = req->r_request->front.iov_base;
240 op = (void *)(head + 1);
241
242 num_ops = le16_to_cpu(head->num_ops);
243 olen = le32_to_cpu(head->object_len);
244 seq_printf(s, "%.*s", olen,
245 (const char *)(head->ops + num_ops));
246
247 if (req->r_reassert_version.epoch)
248 seq_printf(s, "\t%u'%llu",
249 (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
250 le64_to_cpu(req->r_reassert_version.version));
251 else
252 seq_printf(s, "\t");
253
254 for (i = 0; i < num_ops; i++) {
255 opcode = le16_to_cpu(op->op);
256 seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
257 op++;
258 }
259
260 seq_printf(s, "\n");
261 }
262 mutex_unlock(&osdc->request_mutex);
263 return 0;
264}
265
266static int caps_show(struct seq_file *s, void *p) 123static int caps_show(struct seq_file *s, void *p)
267{ 124{
268 struct ceph_client *client = s->private; 125 struct ceph_fs_client *fsc = s->private;
269 int total, avail, used, reserved, min; 126 int total, avail, used, reserved, min;
270 127
271 ceph_reservation_status(client, &total, &avail, &used, &reserved, &min); 128 ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
272 seq_printf(s, "total\t\t%d\n" 129 seq_printf(s, "total\t\t%d\n"
273 "avail\t\t%d\n" 130 "avail\t\t%d\n"
274 "used\t\t%d\n" 131 "used\t\t%d\n"
@@ -280,8 +137,8 @@ static int caps_show(struct seq_file *s, void *p)
280 137
281static int dentry_lru_show(struct seq_file *s, void *ptr) 138static int dentry_lru_show(struct seq_file *s, void *ptr)
282{ 139{
283 struct ceph_client *client = s->private; 140 struct ceph_fs_client *fsc = s->private;
284 struct ceph_mds_client *mdsc = &client->mdsc; 141 struct ceph_mds_client *mdsc = fsc->mdsc;
285 struct ceph_dentry_info *di; 142 struct ceph_dentry_info *di;
286 143
287 spin_lock(&mdsc->dentry_lru_lock); 144 spin_lock(&mdsc->dentry_lru_lock);
@@ -295,199 +152,124 @@ static int dentry_lru_show(struct seq_file *s, void *ptr)
295 return 0; 152 return 0;
296} 153}
297 154
298#define DEFINE_SHOW_FUNC(name) \ 155CEPH_DEFINE_SHOW_FUNC(mdsmap_show)
299static int name##_open(struct inode *inode, struct file *file) \ 156CEPH_DEFINE_SHOW_FUNC(mdsc_show)
300{ \ 157CEPH_DEFINE_SHOW_FUNC(caps_show)
301 struct seq_file *sf; \ 158CEPH_DEFINE_SHOW_FUNC(dentry_lru_show)
302 int ret; \ 159
303 \
304 ret = single_open(file, name, NULL); \
305 sf = file->private_data; \
306 sf->private = inode->i_private; \
307 return ret; \
308} \
309 \
310static const struct file_operations name##_fops = { \
311 .open = name##_open, \
312 .read = seq_read, \
313 .llseek = seq_lseek, \
314 .release = single_release, \
315};
316
317DEFINE_SHOW_FUNC(monmap_show)
318DEFINE_SHOW_FUNC(mdsmap_show)
319DEFINE_SHOW_FUNC(osdmap_show)
320DEFINE_SHOW_FUNC(monc_show)
321DEFINE_SHOW_FUNC(mdsc_show)
322DEFINE_SHOW_FUNC(osdc_show)
323DEFINE_SHOW_FUNC(dentry_lru_show)
324DEFINE_SHOW_FUNC(caps_show)
325 160
161/*
162 * debugfs
163 */
326static int congestion_kb_set(void *data, u64 val) 164static int congestion_kb_set(void *data, u64 val)
327{ 165{
328 struct ceph_client *client = (struct ceph_client *)data; 166 struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
329
330 if (client)
331 client->mount_args->congestion_kb = (int)val;
332 167
168 fsc->mount_options->congestion_kb = (int)val;
333 return 0; 169 return 0;
334} 170}
335 171
336static int congestion_kb_get(void *data, u64 *val) 172static int congestion_kb_get(void *data, u64 *val)
337{ 173{
338 struct ceph_client *client = (struct ceph_client *)data; 174 struct ceph_fs_client *fsc = (struct ceph_fs_client *)data;
339
340 if (client)
341 *val = (u64)client->mount_args->congestion_kb;
342 175
176 *val = (u64)fsc->mount_options->congestion_kb;
343 return 0; 177 return 0;
344} 178}
345 179
346
347DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get, 180DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
348 congestion_kb_set, "%llu\n"); 181 congestion_kb_set, "%llu\n");
349 182
350int __init ceph_debugfs_init(void)
351{
352 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
353 if (!ceph_debugfs_dir)
354 return -ENOMEM;
355 return 0;
356}
357 183
358void ceph_debugfs_cleanup(void) 184void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
359{ 185{
360 debugfs_remove(ceph_debugfs_dir); 186 dout("ceph_fs_debugfs_cleanup\n");
187 debugfs_remove(fsc->debugfs_bdi);
188 debugfs_remove(fsc->debugfs_congestion_kb);
189 debugfs_remove(fsc->debugfs_mdsmap);
190 debugfs_remove(fsc->debugfs_caps);
191 debugfs_remove(fsc->debugfs_mdsc);
192 debugfs_remove(fsc->debugfs_dentry_lru);
361} 193}
362 194
363int ceph_debugfs_client_init(struct ceph_client *client) 195int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
364{ 196{
365 int ret = 0; 197 char name[100];
366 char name[80]; 198 int err = -ENOMEM;
367
368 snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
369 client->monc.auth->global_id);
370 199
371 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir); 200 dout("ceph_fs_debugfs_init\n");
372 if (!client->debugfs_dir) 201 fsc->debugfs_congestion_kb =
373 goto out; 202 debugfs_create_file("writeback_congestion_kb",
374 203 0600,
375 client->monc.debugfs_file = debugfs_create_file("monc", 204 fsc->client->debugfs_dir,
376 0600, 205 fsc,
377 client->debugfs_dir, 206 &congestion_kb_fops);
378 client, 207 if (!fsc->debugfs_congestion_kb)
379 &monc_show_fops);
380 if (!client->monc.debugfs_file)
381 goto out; 208 goto out;
382 209
383 client->mdsc.debugfs_file = debugfs_create_file("mdsc", 210 dout("a\n");
384 0600,
385 client->debugfs_dir,
386 client,
387 &mdsc_show_fops);
388 if (!client->mdsc.debugfs_file)
389 goto out;
390 211
391 client->osdc.debugfs_file = debugfs_create_file("osdc", 212 snprintf(name, sizeof(name), "../../bdi/%s",
392 0600, 213 dev_name(fsc->backing_dev_info.dev));
393 client->debugfs_dir, 214 fsc->debugfs_bdi =
394 client, 215 debugfs_create_symlink("bdi",
395 &osdc_show_fops); 216 fsc->client->debugfs_dir,
396 if (!client->osdc.debugfs_file) 217 name);
218 if (!fsc->debugfs_bdi)
397 goto out; 219 goto out;
398 220
399 client->debugfs_monmap = debugfs_create_file("monmap", 221 dout("b\n");
222 fsc->debugfs_mdsmap = debugfs_create_file("mdsmap",
400 0600, 223 0600,
401 client->debugfs_dir, 224 fsc->client->debugfs_dir,
402 client, 225 fsc,
403 &monmap_show_fops);
404 if (!client->debugfs_monmap)
405 goto out;
406
407 client->debugfs_mdsmap = debugfs_create_file("mdsmap",
408 0600,
409 client->debugfs_dir,
410 client,
411 &mdsmap_show_fops); 226 &mdsmap_show_fops);
412 if (!client->debugfs_mdsmap) 227 if (!fsc->debugfs_mdsmap)
413 goto out;
414
415 client->debugfs_osdmap = debugfs_create_file("osdmap",
416 0600,
417 client->debugfs_dir,
418 client,
419 &osdmap_show_fops);
420 if (!client->debugfs_osdmap)
421 goto out; 228 goto out;
422 229
423 client->debugfs_dentry_lru = debugfs_create_file("dentry_lru", 230 dout("ca\n");
424 0600, 231 fsc->debugfs_mdsc = debugfs_create_file("mdsc",
425 client->debugfs_dir, 232 0600,
426 client, 233 fsc->client->debugfs_dir,
427 &dentry_lru_show_fops); 234 fsc,
428 if (!client->debugfs_dentry_lru) 235 &mdsc_show_fops);
236 if (!fsc->debugfs_mdsc)
429 goto out; 237 goto out;
430 238
431 client->debugfs_caps = debugfs_create_file("caps", 239 dout("da\n");
240 fsc->debugfs_caps = debugfs_create_file("caps",
432 0400, 241 0400,
433 client->debugfs_dir, 242 fsc->client->debugfs_dir,
434 client, 243 fsc,
435 &caps_show_fops); 244 &caps_show_fops);
436 if (!client->debugfs_caps) 245 if (!fsc->debugfs_caps)
437 goto out; 246 goto out;
438 247
439 client->debugfs_congestion_kb = 248 dout("ea\n");
440 debugfs_create_file("writeback_congestion_kb", 249 fsc->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
441 0600, 250 0600,
442 client->debugfs_dir, 251 fsc->client->debugfs_dir,
443 client, 252 fsc,
444 &congestion_kb_fops); 253 &dentry_lru_show_fops);
445 if (!client->debugfs_congestion_kb) 254 if (!fsc->debugfs_dentry_lru)
446 goto out; 255 goto out;
447 256
448 sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
449 client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
450 name);
451
452 return 0; 257 return 0;
453 258
454out: 259out:
455 ceph_debugfs_client_cleanup(client); 260 ceph_fs_debugfs_cleanup(fsc);
456 return ret; 261 return err;
457} 262}
458 263
459void ceph_debugfs_client_cleanup(struct ceph_client *client)
460{
461 debugfs_remove(client->debugfs_bdi);
462 debugfs_remove(client->debugfs_caps);
463 debugfs_remove(client->debugfs_dentry_lru);
464 debugfs_remove(client->debugfs_osdmap);
465 debugfs_remove(client->debugfs_mdsmap);
466 debugfs_remove(client->debugfs_monmap);
467 debugfs_remove(client->osdc.debugfs_file);
468 debugfs_remove(client->mdsc.debugfs_file);
469 debugfs_remove(client->monc.debugfs_file);
470 debugfs_remove(client->debugfs_congestion_kb);
471 debugfs_remove(client->debugfs_dir);
472}
473 264
474#else /* CONFIG_DEBUG_FS */ 265#else /* CONFIG_DEBUG_FS */
475 266
476int __init ceph_debugfs_init(void) 267int ceph_fs_debugfs_init(struct ceph_fs_client *fsc)
477{
478 return 0;
479}
480
481void ceph_debugfs_cleanup(void)
482{
483}
484
485int ceph_debugfs_client_init(struct ceph_client *client)
486{ 268{
487 return 0; 269 return 0;
488} 270}
489 271
490void ceph_debugfs_client_cleanup(struct ceph_client *client) 272void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc)
491{ 273{
492} 274}
493 275
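The per-client show functions kept above (mdsmap_show, mdsc_show, caps_show, dentry_lru_show) are now wrapped by CEPH_DEFINE_SHOW_FUNC from the shared libceph debugfs header instead of the local DEFINE_SHOW_FUNC macro that this hunk deletes. A minimal sketch of what that shared macro has to provide, assuming it simply mirrors the removed local version (the real header may differ in detail):

	#define CEPH_DEFINE_SHOW_FUNC(name) \
	static int name##_open(struct inode *inode, struct file *file) \
	{ \
		struct seq_file *sf; \
		int ret; \
	 \
		ret = single_open(file, name, NULL); \
		sf = file->private_data; \
		/* hand the pointer stashed in i_private to the show function */ \
		sf->private = inode->i_private; \
		return ret; \
	} \
	 \
	static const struct file_operations name##_fops = { \
		.open		= name##_open, \
		.read		= seq_read, \
		.llseek		= seq_lseek, \
		.release	= single_release, \
	};

With such a wrapper in place, the debugfs_create_file() calls in ceph_fs_debugfs_init() pass fsc as i_private, which is why every show function above recovers it through s->private.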
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index a1986eb52045..e0a2dc6fcafc 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/spinlock.h> 3#include <linux/spinlock.h>
4#include <linux/fs_struct.h> 4#include <linux/fs_struct.h>
@@ -7,6 +7,7 @@
7#include <linux/sched.h> 7#include <linux/sched.h>
8 8
9#include "super.h" 9#include "super.h"
10#include "mds_client.h"
10 11
11/* 12/*
12 * Directory operations: readdir, lookup, create, link, unlink, 13 * Directory operations: readdir, lookup, create, link, unlink,
@@ -94,10 +95,7 @@ static unsigned fpos_off(loff_t p)
94 */ 95 */
95static int __dcache_readdir(struct file *filp, 96static int __dcache_readdir(struct file *filp,
96 void *dirent, filldir_t filldir) 97 void *dirent, filldir_t filldir)
97 __releases(inode->i_lock)
98 __acquires(inode->i_lock)
99{ 98{
100 struct inode *inode = filp->f_dentry->d_inode;
101 struct ceph_file_info *fi = filp->private_data; 99 struct ceph_file_info *fi = filp->private_data;
102 struct dentry *parent = filp->f_dentry; 100 struct dentry *parent = filp->f_dentry;
103 struct inode *dir = parent->d_inode; 101 struct inode *dir = parent->d_inode;
@@ -153,7 +151,6 @@ more:
153 151
154 atomic_inc(&dentry->d_count); 152 atomic_inc(&dentry->d_count);
155 spin_unlock(&dcache_lock); 153 spin_unlock(&dcache_lock);
156 spin_unlock(&inode->i_lock);
157 154
158 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos, 155 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
159 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); 156 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
@@ -171,35 +168,30 @@ more:
171 } else { 168 } else {
172 dput(last); 169 dput(last);
173 } 170 }
174 last = NULL;
175 } 171 }
176
177 spin_lock(&inode->i_lock);
178 spin_lock(&dcache_lock);
179
180 last = dentry; 172 last = dentry;
181 173
182 if (err < 0) 174 if (err < 0)
183 goto out_unlock; 175 goto out;
184 176
185 p = p->prev;
186 filp->f_pos++; 177 filp->f_pos++;
187 178
188 /* make sure a dentry wasn't dropped while we didn't have dcache_lock */ 179 /* make sure a dentry wasn't dropped while we didn't have dcache_lock */
189 if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE)) 180 if (!ceph_i_test(dir, CEPH_I_COMPLETE)) {
190 goto more; 181 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
191 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir); 182 err = -EAGAIN;
192 err = -EAGAIN; 183 goto out;
184 }
185
186 spin_lock(&dcache_lock);
187 p = p->prev; /* advance to next dentry */
188 goto more;
193 189
194out_unlock: 190out_unlock:
195 spin_unlock(&dcache_lock); 191 spin_unlock(&dcache_lock);
196 192out:
197 if (last) { 193 if (last)
198 spin_unlock(&inode->i_lock);
199 dput(last); 194 dput(last);
200 spin_lock(&inode->i_lock);
201 }
202
203 return err; 195 return err;
204} 196}
205 197
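One effect of no longer holding i_lock across __dcache_readdir() (the __releases/__acquires annotations and the unlock/relock dance are gone) is that the I_COMPLETE re-check can no longer read i_ceph_flags directly; it goes through ceph_i_test() instead. A sketch of the helper this code presumably relies on, assuming it takes i_lock around the flag test as the surrounding locking change suggests (the real definition in super.h may differ):

	static inline bool ceph_i_test(struct inode *inode, unsigned mask)
	{
		struct ceph_inode_info *ci = ceph_inode(inode);
		bool r;

		/* flags are protected by i_lock, which the caller no longer holds */
		spin_lock(&inode->i_lock);
		r = (ci->i_ceph_flags & mask) == mask;
		spin_unlock(&inode->i_lock);
		return r;
	}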
@@ -227,15 +219,15 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
227 struct ceph_file_info *fi = filp->private_data; 219 struct ceph_file_info *fi = filp->private_data;
228 struct inode *inode = filp->f_dentry->d_inode; 220 struct inode *inode = filp->f_dentry->d_inode;
229 struct ceph_inode_info *ci = ceph_inode(inode); 221 struct ceph_inode_info *ci = ceph_inode(inode);
230 struct ceph_client *client = ceph_inode_to_client(inode); 222 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
231 struct ceph_mds_client *mdsc = &client->mdsc; 223 struct ceph_mds_client *mdsc = fsc->mdsc;
232 unsigned frag = fpos_frag(filp->f_pos); 224 unsigned frag = fpos_frag(filp->f_pos);
233 int off = fpos_off(filp->f_pos); 225 int off = fpos_off(filp->f_pos);
234 int err; 226 int err;
235 u32 ftype; 227 u32 ftype;
236 struct ceph_mds_reply_info_parsed *rinfo; 228 struct ceph_mds_reply_info_parsed *rinfo;
237 const int max_entries = client->mount_args->max_readdir; 229 const int max_entries = fsc->mount_options->max_readdir;
238 const int max_bytes = client->mount_args->max_readdir_bytes; 230 const int max_bytes = fsc->mount_options->max_readdir_bytes;
239 231
240 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off); 232 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
241 if (fi->at_end) 233 if (fi->at_end)
@@ -267,17 +259,17 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
267 /* can we use the dcache? */ 259 /* can we use the dcache? */
268 spin_lock(&inode->i_lock); 260 spin_lock(&inode->i_lock);
269 if ((filp->f_pos == 2 || fi->dentry) && 261 if ((filp->f_pos == 2 || fi->dentry) &&
270 !ceph_test_opt(client, NOASYNCREADDIR) && 262 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
271 ceph_snap(inode) != CEPH_SNAPDIR && 263 ceph_snap(inode) != CEPH_SNAPDIR &&
272 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 264 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
273 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { 265 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
266 spin_unlock(&inode->i_lock);
274 err = __dcache_readdir(filp, dirent, filldir); 267 err = __dcache_readdir(filp, dirent, filldir);
275 if (err != -EAGAIN) { 268 if (err != -EAGAIN)
276 spin_unlock(&inode->i_lock);
277 return err; 269 return err;
278 } 270 } else {
271 spin_unlock(&inode->i_lock);
279 } 272 }
280 spin_unlock(&inode->i_lock);
281 if (fi->dentry) { 273 if (fi->dentry) {
282 err = note_last_dentry(fi, fi->dentry->d_name.name, 274 err = note_last_dentry(fi, fi->dentry->d_name.name,
283 fi->dentry->d_name.len); 275 fi->dentry->d_name.len);
@@ -487,14 +479,13 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
487struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, 479struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
488 struct dentry *dentry, int err) 480 struct dentry *dentry, int err)
489{ 481{
490 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); 482 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
491 struct inode *parent = dentry->d_parent->d_inode; 483 struct inode *parent = dentry->d_parent->d_inode;
492 484
493 /* .snap dir? */ 485 /* .snap dir? */
494 if (err == -ENOENT && 486 if (err == -ENOENT &&
495 ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
496 strcmp(dentry->d_name.name, 487 strcmp(dentry->d_name.name,
497 client->mount_args->snapdir_name) == 0) { 488 fsc->mount_options->snapdir_name) == 0) {
498 struct inode *inode = ceph_get_snapdir(parent); 489 struct inode *inode = ceph_get_snapdir(parent);
499 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n", 490 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
500 dentry, dentry->d_name.len, dentry->d_name.name, inode); 491 dentry, dentry->d_name.len, dentry->d_name.name, inode);
@@ -539,8 +530,8 @@ static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
539static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, 530static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
540 struct nameidata *nd) 531 struct nameidata *nd)
541{ 532{
542 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 533 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
543 struct ceph_mds_client *mdsc = &client->mdsc; 534 struct ceph_mds_client *mdsc = fsc->mdsc;
544 struct ceph_mds_request *req; 535 struct ceph_mds_request *req;
545 int op; 536 int op;
546 int err; 537 int err;
@@ -572,7 +563,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
572 spin_lock(&dir->i_lock); 563 spin_lock(&dir->i_lock);
573 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags); 564 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
574 if (strncmp(dentry->d_name.name, 565 if (strncmp(dentry->d_name.name,
575 client->mount_args->snapdir_name, 566 fsc->mount_options->snapdir_name,
576 dentry->d_name.len) && 567 dentry->d_name.len) &&
577 !is_root_ceph_dentry(dir, dentry) && 568 !is_root_ceph_dentry(dir, dentry) &&
578 (ci->i_ceph_flags & CEPH_I_COMPLETE) && 569 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
@@ -629,8 +620,8 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
629static int ceph_mknod(struct inode *dir, struct dentry *dentry, 620static int ceph_mknod(struct inode *dir, struct dentry *dentry,
630 int mode, dev_t rdev) 621 int mode, dev_t rdev)
631{ 622{
632 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 623 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
633 struct ceph_mds_client *mdsc = &client->mdsc; 624 struct ceph_mds_client *mdsc = fsc->mdsc;
634 struct ceph_mds_request *req; 625 struct ceph_mds_request *req;
635 int err; 626 int err;
636 627
@@ -685,8 +676,8 @@ static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
685static int ceph_symlink(struct inode *dir, struct dentry *dentry, 676static int ceph_symlink(struct inode *dir, struct dentry *dentry,
686 const char *dest) 677 const char *dest)
687{ 678{
688 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 679 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
689 struct ceph_mds_client *mdsc = &client->mdsc; 680 struct ceph_mds_client *mdsc = fsc->mdsc;
690 struct ceph_mds_request *req; 681 struct ceph_mds_request *req;
691 int err; 682 int err;
692 683
@@ -716,8 +707,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry,
716 707
717static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode) 708static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
718{ 709{
719 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 710 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
720 struct ceph_mds_client *mdsc = &client->mdsc; 711 struct ceph_mds_client *mdsc = fsc->mdsc;
721 struct ceph_mds_request *req; 712 struct ceph_mds_request *req;
722 int err = -EROFS; 713 int err = -EROFS;
723 int op; 714 int op;
@@ -758,8 +749,8 @@ out:
758static int ceph_link(struct dentry *old_dentry, struct inode *dir, 749static int ceph_link(struct dentry *old_dentry, struct inode *dir,
759 struct dentry *dentry) 750 struct dentry *dentry)
760{ 751{
761 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 752 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
762 struct ceph_mds_client *mdsc = &client->mdsc; 753 struct ceph_mds_client *mdsc = fsc->mdsc;
763 struct ceph_mds_request *req; 754 struct ceph_mds_request *req;
764 int err; 755 int err;
765 756
@@ -813,8 +804,8 @@ static int drop_caps_for_unlink(struct inode *inode)
813 */ 804 */
814static int ceph_unlink(struct inode *dir, struct dentry *dentry) 805static int ceph_unlink(struct inode *dir, struct dentry *dentry)
815{ 806{
816 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 807 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
817 struct ceph_mds_client *mdsc = &client->mdsc; 808 struct ceph_mds_client *mdsc = fsc->mdsc;
818 struct inode *inode = dentry->d_inode; 809 struct inode *inode = dentry->d_inode;
819 struct ceph_mds_request *req; 810 struct ceph_mds_request *req;
820 int err = -EROFS; 811 int err = -EROFS;
@@ -854,8 +845,8 @@ out:
854static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, 845static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
855 struct inode *new_dir, struct dentry *new_dentry) 846 struct inode *new_dir, struct dentry *new_dentry)
856{ 847{
857 struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb); 848 struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb);
858 struct ceph_mds_client *mdsc = &client->mdsc; 849 struct ceph_mds_client *mdsc = fsc->mdsc;
859 struct ceph_mds_request *req; 850 struct ceph_mds_request *req;
860 int err; 851 int err;
861 852
@@ -1076,7 +1067,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1076 struct ceph_inode_info *ci = ceph_inode(inode); 1067 struct ceph_inode_info *ci = ceph_inode(inode);
1077 int left; 1068 int left;
1078 1069
1079 if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT)) 1070 if (!ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
1080 return -EISDIR; 1071 return -EISDIR;
1081 1072
1082 if (!cf->dir_info) { 1073 if (!cf->dir_info) {
@@ -1177,7 +1168,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
1177 dout("dentry_lru_add %p %p '%.*s'\n", di, dn, 1168 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1178 dn->d_name.len, dn->d_name.name); 1169 dn->d_name.len, dn->d_name.name);
1179 if (di) { 1170 if (di) {
1180 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; 1171 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1181 spin_lock(&mdsc->dentry_lru_lock); 1172 spin_lock(&mdsc->dentry_lru_lock);
1182 list_add_tail(&di->lru, &mdsc->dentry_lru); 1173 list_add_tail(&di->lru, &mdsc->dentry_lru);
1183 mdsc->num_dentry++; 1174 mdsc->num_dentry++;
@@ -1193,7 +1184,7 @@ void ceph_dentry_lru_touch(struct dentry *dn)
1193 dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn, 1184 dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
1194 dn->d_name.len, dn->d_name.name, di->offset); 1185 dn->d_name.len, dn->d_name.name, di->offset);
1195 if (di) { 1186 if (di) {
1196 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; 1187 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1197 spin_lock(&mdsc->dentry_lru_lock); 1188 spin_lock(&mdsc->dentry_lru_lock);
1198 list_move_tail(&di->lru, &mdsc->dentry_lru); 1189 list_move_tail(&di->lru, &mdsc->dentry_lru);
1199 spin_unlock(&mdsc->dentry_lru_lock); 1190 spin_unlock(&mdsc->dentry_lru_lock);
@@ -1208,7 +1199,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
1208 dout("dentry_lru_del %p %p '%.*s'\n", di, dn, 1199 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1209 dn->d_name.len, dn->d_name.name); 1200 dn->d_name.len, dn->d_name.name);
1210 if (di) { 1201 if (di) {
1211 mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc; 1202 mdsc = ceph_sb_to_client(dn->d_sb)->mdsc;
1212 spin_lock(&mdsc->dentry_lru_lock); 1203 spin_lock(&mdsc->dentry_lru_lock);
1213 list_del_init(&di->lru); 1204 list_del_init(&di->lru);
1214 mdsc->num_dentry--; 1205 mdsc->num_dentry--;
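The edit repeated throughout this file, &client->mdsc becoming fsc->mdsc, is the visible edge of the ceph_client/ceph_fs_client split: filesystem-specific state now holds a pointer to a separately allocated MDS client instead of embedding it, and the shared mon/osd/messenger state is reached through fsc->client. A schematic sketch of the two layouts, trimmed to fields that actually appear in this patch (member types come from elided context, so this is not meant to compile on its own):

	/* before the split: everything hung off one ceph_client */
	struct ceph_client {
		struct ceph_mount_args	*mount_args;	/* client->mount_args->... */
		struct ceph_mon_client	monc;		/* embedded */
		struct ceph_mds_client	mdsc;		/* hence &client->mdsc */
		struct ceph_osd_client	osdc;		/* hence &client->osdc */
		struct super_block	*sb;
		/* ... */
	};

	/* after the split: fs state separate, libceph state behind ->client */
	struct ceph_fs_client {
		struct ceph_client	  *client;	/* monc, osdc, msgr, debugfs_dir */
		struct ceph_mds_client	  *mdsc;	/* allocated in ceph_mdsc_init() */
		struct ceph_mount_options *mount_options;
		struct super_block	  *sb;
		/* ... */
	};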
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index e38423e82f2e..2297d9426992 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -1,10 +1,11 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/exportfs.h> 3#include <linux/exportfs.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
5#include <asm/unaligned.h> 5#include <asm/unaligned.h>
6 6
7#include "super.h" 7#include "super.h"
8#include "mds_client.h"
8 9
9/* 10/*
10 * NFS export support 11 * NFS export support
@@ -120,7 +121,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
120static struct dentry *__cfh_to_dentry(struct super_block *sb, 121static struct dentry *__cfh_to_dentry(struct super_block *sb,
121 struct ceph_nfs_confh *cfh) 122 struct ceph_nfs_confh *cfh)
122{ 123{
123 struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc; 124 struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc;
124 struct inode *inode; 125 struct inode *inode;
125 struct dentry *dentry; 126 struct dentry *dentry;
126 struct ceph_vino vino; 127 struct ceph_vino vino;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 66e4da6dba22..e77c28cf3690 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1,5 +1,6 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/module.h>
3#include <linux/sched.h> 4#include <linux/sched.h>
4#include <linux/slab.h> 5#include <linux/slab.h>
5#include <linux/file.h> 6#include <linux/file.h>
@@ -38,8 +39,8 @@
38static struct ceph_mds_request * 39static struct ceph_mds_request *
39prepare_open_request(struct super_block *sb, int flags, int create_mode) 40prepare_open_request(struct super_block *sb, int flags, int create_mode)
40{ 41{
41 struct ceph_client *client = ceph_sb_to_client(sb); 42 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
42 struct ceph_mds_client *mdsc = &client->mdsc; 43 struct ceph_mds_client *mdsc = fsc->mdsc;
43 struct ceph_mds_request *req; 44 struct ceph_mds_request *req;
44 int want_auth = USE_ANY_MDS; 45 int want_auth = USE_ANY_MDS;
45 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN; 46 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
@@ -117,8 +118,8 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
117int ceph_open(struct inode *inode, struct file *file) 118int ceph_open(struct inode *inode, struct file *file)
118{ 119{
119 struct ceph_inode_info *ci = ceph_inode(inode); 120 struct ceph_inode_info *ci = ceph_inode(inode);
120 struct ceph_client *client = ceph_sb_to_client(inode->i_sb); 121 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
121 struct ceph_mds_client *mdsc = &client->mdsc; 122 struct ceph_mds_client *mdsc = fsc->mdsc;
122 struct ceph_mds_request *req; 123 struct ceph_mds_request *req;
123 struct ceph_file_info *cf = file->private_data; 124 struct ceph_file_info *cf = file->private_data;
124 struct inode *parent_inode = file->f_dentry->d_parent->d_inode; 125 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
@@ -216,8 +217,8 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
216 struct nameidata *nd, int mode, 217 struct nameidata *nd, int mode,
217 int locked_dir) 218 int locked_dir)
218{ 219{
219 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 220 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
220 struct ceph_mds_client *mdsc = &client->mdsc; 221 struct ceph_mds_client *mdsc = fsc->mdsc;
221 struct file *file = nd->intent.open.file; 222 struct file *file = nd->intent.open.file;
222 struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry); 223 struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
223 struct ceph_mds_request *req; 224 struct ceph_mds_request *req;
@@ -270,163 +271,6 @@ int ceph_release(struct inode *inode, struct file *file)
270} 271}
271 272
272/* 273/*
273 * build a vector of user pages
274 */
275static struct page **get_direct_page_vector(const char __user *data,
276 int num_pages,
277 loff_t off, size_t len)
278{
279 struct page **pages;
280 int rc;
281
282 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
283 if (!pages)
284 return ERR_PTR(-ENOMEM);
285
286 down_read(&current->mm->mmap_sem);
287 rc = get_user_pages(current, current->mm, (unsigned long)data,
288 num_pages, 0, 0, pages, NULL);
289 up_read(&current->mm->mmap_sem);
290 if (rc < 0)
291 goto fail;
292 return pages;
293
294fail:
295 kfree(pages);
296 return ERR_PTR(rc);
297}
298
299static void put_page_vector(struct page **pages, int num_pages)
300{
301 int i;
302
303 for (i = 0; i < num_pages; i++)
304 put_page(pages[i]);
305 kfree(pages);
306}
307
308void ceph_release_page_vector(struct page **pages, int num_pages)
309{
310 int i;
311
312 for (i = 0; i < num_pages; i++)
313 __free_pages(pages[i], 0);
314 kfree(pages);
315}
316
317/*
318 * allocate a vector new pages
319 */
320static struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
321{
322 struct page **pages;
323 int i;
324
325 pages = kmalloc(sizeof(*pages) * num_pages, flags);
326 if (!pages)
327 return ERR_PTR(-ENOMEM);
328 for (i = 0; i < num_pages; i++) {
329 pages[i] = __page_cache_alloc(flags);
330 if (pages[i] == NULL) {
331 ceph_release_page_vector(pages, i);
332 return ERR_PTR(-ENOMEM);
333 }
334 }
335 return pages;
336}
337
338/*
339 * copy user data into a page vector
340 */
341static int copy_user_to_page_vector(struct page **pages,
342 const char __user *data,
343 loff_t off, size_t len)
344{
345 int i = 0;
346 int po = off & ~PAGE_CACHE_MASK;
347 int left = len;
348 int l, bad;
349
350 while (left > 0) {
351 l = min_t(int, PAGE_CACHE_SIZE-po, left);
352 bad = copy_from_user(page_address(pages[i]) + po, data, l);
353 if (bad == l)
354 return -EFAULT;
355 data += l - bad;
356 left -= l - bad;
357 po += l - bad;
358 if (po == PAGE_CACHE_SIZE) {
359 po = 0;
360 i++;
361 }
362 }
363 return len;
364}
365
366/*
367 * copy user data from a page vector into a user pointer
368 */
369static int copy_page_vector_to_user(struct page **pages, char __user *data,
370 loff_t off, size_t len)
371{
372 int i = 0;
373 int po = off & ~PAGE_CACHE_MASK;
374 int left = len;
375 int l, bad;
376
377 while (left > 0) {
378 l = min_t(int, left, PAGE_CACHE_SIZE-po);
379 bad = copy_to_user(data, page_address(pages[i]) + po, l);
380 if (bad == l)
381 return -EFAULT;
382 data += l - bad;
383 left -= l - bad;
384 if (po) {
385 po += l - bad;
386 if (po == PAGE_CACHE_SIZE)
387 po = 0;
388 }
389 i++;
390 }
391 return len;
392}
393
394/*
395 * Zero an extent within a page vector. Offset is relative to the
396 * start of the first page.
397 */
398static void zero_page_vector_range(int off, int len, struct page **pages)
399{
400 int i = off >> PAGE_CACHE_SHIFT;
401
402 off &= ~PAGE_CACHE_MASK;
403
404 dout("zero_page_vector_page %u~%u\n", off, len);
405
406 /* leading partial page? */
407 if (off) {
408 int end = min((int)PAGE_CACHE_SIZE, off + len);
409 dout("zeroing %d %p head from %d\n", i, pages[i],
410 (int)off);
411 zero_user_segment(pages[i], off, end);
412 len -= (end - off);
413 i++;
414 }
415 while (len >= PAGE_CACHE_SIZE) {
416 dout("zeroing %d %p len=%d\n", i, pages[i], len);
417 zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
418 len -= PAGE_CACHE_SIZE;
419 i++;
420 }
421 /* trailing partial page? */
422 if (len) {
423 dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
424 zero_user_segment(pages[i], 0, len);
425 }
426}
427
428
429/*
430 * Read a range of bytes striped over one or more objects. Iterate over 274 * Read a range of bytes striped over one or more objects. Iterate over
431 * objects we stripe over. (That's not atomic, but good enough for now.) 275 * objects we stripe over. (That's not atomic, but good enough for now.)
432 * 276 *
@@ -438,7 +282,7 @@ static int striped_read(struct inode *inode,
438 struct page **pages, int num_pages, 282 struct page **pages, int num_pages,
439 int *checkeof) 283 int *checkeof)
440{ 284{
441 struct ceph_client *client = ceph_inode_to_client(inode); 285 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
442 struct ceph_inode_info *ci = ceph_inode(inode); 286 struct ceph_inode_info *ci = ceph_inode(inode);
443 u64 pos, this_len; 287 u64 pos, this_len;
444 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ 288 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
@@ -459,7 +303,7 @@ static int striped_read(struct inode *inode,
459 303
460more: 304more:
461 this_len = left; 305 this_len = left;
462 ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode), 306 ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
463 &ci->i_layout, pos, &this_len, 307 &ci->i_layout, pos, &this_len,
464 ci->i_truncate_seq, 308 ci->i_truncate_seq,
465 ci->i_truncate_size, 309 ci->i_truncate_size,
@@ -477,8 +321,8 @@ more:
477 321
478 if (read < pos - off) { 322 if (read < pos - off) {
479 dout(" zero gap %llu to %llu\n", off + read, pos); 323 dout(" zero gap %llu to %llu\n", off + read, pos);
480 zero_page_vector_range(page_off + read, 324 ceph_zero_page_vector_range(page_off + read,
481 pos - off - read, pages); 325 pos - off - read, pages);
482 } 326 }
483 pos += ret; 327 pos += ret;
484 read = pos - off; 328 read = pos - off;
@@ -495,8 +339,8 @@ more:
495 /* was original extent fully inside i_size? */ 339 /* was original extent fully inside i_size? */
496 if (pos + left <= inode->i_size) { 340 if (pos + left <= inode->i_size) {
497 dout("zero tail\n"); 341 dout("zero tail\n");
498 zero_page_vector_range(page_off + read, len - read, 342 ceph_zero_page_vector_range(page_off + read, len - read,
499 pages); 343 pages);
500 read = len; 344 read = len;
501 goto out; 345 goto out;
502 } 346 }
@@ -531,7 +375,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
531 (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 375 (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
532 376
533 if (file->f_flags & O_DIRECT) { 377 if (file->f_flags & O_DIRECT) {
534 pages = get_direct_page_vector(data, num_pages, off, len); 378 pages = ceph_get_direct_page_vector(data, num_pages, off, len);
535 379
536 /* 380 /*
537 * flush any page cache pages in this range. this 381 * flush any page cache pages in this range. this
@@ -552,13 +396,13 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
552 ret = striped_read(inode, off, len, pages, num_pages, checkeof); 396 ret = striped_read(inode, off, len, pages, num_pages, checkeof);
553 397
554 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) 398 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
555 ret = copy_page_vector_to_user(pages, data, off, ret); 399 ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
556 if (ret >= 0) 400 if (ret >= 0)
557 *poff = off + ret; 401 *poff = off + ret;
558 402
559done: 403done:
560 if (file->f_flags & O_DIRECT) 404 if (file->f_flags & O_DIRECT)
561 put_page_vector(pages, num_pages); 405 ceph_put_page_vector(pages, num_pages);
562 else 406 else
563 ceph_release_page_vector(pages, num_pages); 407 ceph_release_page_vector(pages, num_pages);
564 dout("sync_read result %d\n", ret); 408 dout("sync_read result %d\n", ret);
@@ -594,7 +438,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
594{ 438{
595 struct inode *inode = file->f_dentry->d_inode; 439 struct inode *inode = file->f_dentry->d_inode;
596 struct ceph_inode_info *ci = ceph_inode(inode); 440 struct ceph_inode_info *ci = ceph_inode(inode);
597 struct ceph_client *client = ceph_inode_to_client(inode); 441 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
598 struct ceph_osd_request *req; 442 struct ceph_osd_request *req;
599 struct page **pages; 443 struct page **pages;
600 int num_pages; 444 int num_pages;
@@ -642,7 +486,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
642 */ 486 */
643more: 487more:
644 len = left; 488 len = left;
645 req = ceph_osdc_new_request(&client->osdc, &ci->i_layout, 489 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
646 ceph_vino(inode), pos, &len, 490 ceph_vino(inode), pos, &len,
647 CEPH_OSD_OP_WRITE, flags, 491 CEPH_OSD_OP_WRITE, flags,
648 ci->i_snap_realm->cached_context, 492 ci->i_snap_realm->cached_context,
@@ -655,7 +499,7 @@ more:
655 num_pages = calc_pages_for(pos, len); 499 num_pages = calc_pages_for(pos, len);
656 500
657 if (file->f_flags & O_DIRECT) { 501 if (file->f_flags & O_DIRECT) {
658 pages = get_direct_page_vector(data, num_pages, pos, len); 502 pages = ceph_get_direct_page_vector(data, num_pages, pos, len);
659 if (IS_ERR(pages)) { 503 if (IS_ERR(pages)) {
660 ret = PTR_ERR(pages); 504 ret = PTR_ERR(pages);
661 goto out; 505 goto out;
@@ -673,7 +517,7 @@ more:
673 ret = PTR_ERR(pages); 517 ret = PTR_ERR(pages);
674 goto out; 518 goto out;
675 } 519 }
676 ret = copy_user_to_page_vector(pages, data, pos, len); 520 ret = ceph_copy_user_to_page_vector(pages, data, pos, len);
677 if (ret < 0) { 521 if (ret < 0) {
678 ceph_release_page_vector(pages, num_pages); 522 ceph_release_page_vector(pages, num_pages);
679 goto out; 523 goto out;
@@ -689,7 +533,7 @@ more:
689 req->r_num_pages = num_pages; 533 req->r_num_pages = num_pages;
690 req->r_inode = inode; 534 req->r_inode = inode;
691 535
692 ret = ceph_osdc_start_request(&client->osdc, req, false); 536 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
693 if (!ret) { 537 if (!ret) {
694 if (req->r_safe_callback) { 538 if (req->r_safe_callback) {
695 /* 539 /*
@@ -701,11 +545,11 @@ more:
701 spin_unlock(&ci->i_unsafe_lock); 545 spin_unlock(&ci->i_unsafe_lock);
702 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); 546 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
703 } 547 }
704 ret = ceph_osdc_wait_request(&client->osdc, req); 548 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
705 } 549 }
706 550
707 if (file->f_flags & O_DIRECT) 551 if (file->f_flags & O_DIRECT)
708 put_page_vector(pages, num_pages); 552 ceph_put_page_vector(pages, num_pages);
709 else if (file->f_flags & O_SYNC) 553 else if (file->f_flags & O_SYNC)
710 ceph_release_page_vector(pages, num_pages); 554 ceph_release_page_vector(pages, num_pages);
711 555
@@ -814,7 +658,8 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
814 struct ceph_file_info *fi = file->private_data; 658 struct ceph_file_info *fi = file->private_data;
815 struct inode *inode = file->f_dentry->d_inode; 659 struct inode *inode = file->f_dentry->d_inode;
816 struct ceph_inode_info *ci = ceph_inode(inode); 660 struct ceph_inode_info *ci = ceph_inode(inode);
817 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; 661 struct ceph_osd_client *osdc =
662 &ceph_sb_to_client(inode->i_sb)->client->osdc;
818 loff_t endoff = pos + iov->iov_len; 663 loff_t endoff = pos + iov->iov_len;
819 int want, got = 0; 664 int want, got = 0;
820 int ret, err; 665 int ret, err;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 62377ec37edf..1d6a45b5a04c 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/fs.h> 4#include <linux/fs.h>
@@ -13,7 +13,8 @@
13#include <linux/pagevec.h> 13#include <linux/pagevec.h>
14 14
15#include "super.h" 15#include "super.h"
16#include "decode.h" 16#include "mds_client.h"
17#include <linux/ceph/decode.h>
17 18
18/* 19/*
19 * Ceph inode operations 20 * Ceph inode operations
@@ -384,7 +385,7 @@ void ceph_destroy_inode(struct inode *inode)
384 */ 385 */
385 if (ci->i_snap_realm) { 386 if (ci->i_snap_realm) {
386 struct ceph_mds_client *mdsc = 387 struct ceph_mds_client *mdsc =
387 &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; 388 ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
388 struct ceph_snap_realm *realm = ci->i_snap_realm; 389 struct ceph_snap_realm *realm = ci->i_snap_realm;
389 390
390 dout(" dropping residual ref to snap realm %p\n", realm); 391 dout(" dropping residual ref to snap realm %p\n", realm);
@@ -685,7 +686,7 @@ static int fill_inode(struct inode *inode,
685 } 686 }
686 687
687 /* it may be better to set st_size in getattr instead? */ 688 /* it may be better to set st_size in getattr instead? */
688 if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES)) 689 if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
689 inode->i_size = ci->i_rbytes; 690 inode->i_size = ci->i_rbytes;
690 break; 691 break;
691 default: 692 default:
@@ -901,7 +902,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
901 struct inode *in = NULL; 902 struct inode *in = NULL;
902 struct ceph_mds_reply_inode *ininfo; 903 struct ceph_mds_reply_inode *ininfo;
903 struct ceph_vino vino; 904 struct ceph_vino vino;
904 struct ceph_client *client = ceph_sb_to_client(sb); 905 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
905 int i = 0; 906 int i = 0;
906 int err = 0; 907 int err = 0;
907 908
@@ -965,7 +966,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
965 */ 966 */
966 if (rinfo->head->is_dentry && !req->r_aborted && 967 if (rinfo->head->is_dentry && !req->r_aborted &&
967 (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, 968 (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name,
968 client->mount_args->snapdir_name, 969 fsc->mount_options->snapdir_name,
969 req->r_dentry->d_name.len))) { 970 req->r_dentry->d_name.len))) {
970 /* 971 /*
971 * lookup link rename : null -> possibly existing inode 972 * lookup link rename : null -> possibly existing inode
@@ -1533,7 +1534,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1533 struct inode *parent_inode = dentry->d_parent->d_inode; 1534 struct inode *parent_inode = dentry->d_parent->d_inode;
1534 const unsigned int ia_valid = attr->ia_valid; 1535 const unsigned int ia_valid = attr->ia_valid;
1535 struct ceph_mds_request *req; 1536 struct ceph_mds_request *req;
1536 struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc; 1537 struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc;
1537 int issued; 1538 int issued;
1538 int release = 0, dirtied = 0; 1539 int release = 0, dirtied = 0;
1539 int mask = 0; 1540 int mask = 0;
@@ -1728,8 +1729,8 @@ out:
1728 */ 1729 */
1729int ceph_do_getattr(struct inode *inode, int mask) 1730int ceph_do_getattr(struct inode *inode, int mask)
1730{ 1731{
1731 struct ceph_client *client = ceph_sb_to_client(inode->i_sb); 1732 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
1732 struct ceph_mds_client *mdsc = &client->mdsc; 1733 struct ceph_mds_client *mdsc = fsc->mdsc;
1733 struct ceph_mds_request *req; 1734 struct ceph_mds_request *req;
1734 int err; 1735 int err;
1735 1736
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 76e307d2aba1..8888c9ba68db 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -1,8 +1,10 @@
1#include <linux/in.h> 1#include <linux/in.h>
2 2
3#include "ioctl.h"
4#include "super.h" 3#include "super.h"
5#include "ceph_debug.h" 4#include "mds_client.h"
5#include <linux/ceph/ceph_debug.h>
6
7#include "ioctl.h"
6 8
7 9
8/* 10/*
@@ -37,7 +39,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
37{ 39{
38 struct inode *inode = file->f_dentry->d_inode; 40 struct inode *inode = file->f_dentry->d_inode;
39 struct inode *parent_inode = file->f_dentry->d_parent->d_inode; 41 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
40 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; 42 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
41 struct ceph_mds_request *req; 43 struct ceph_mds_request *req;
42 struct ceph_ioctl_layout l; 44 struct ceph_ioctl_layout l;
43 int err, i; 45 int err, i;
@@ -90,6 +92,68 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
90} 92}
91 93
92/* 94/*
95 * Set a layout policy on a directory inode. All items in the tree
 96 * rooted at this inode will inherit this layout on creation
 97 * (it does not apply retroactively), unless a subdirectory has
 98 * its own layout policy.
99 */
100static long ceph_ioctl_set_layout_policy(struct file *file, void __user *arg)
101{
102 struct inode *inode = file->f_dentry->d_inode;
103 struct ceph_mds_request *req;
104 struct ceph_ioctl_layout l;
105 int err, i;
106 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
107
108 /* copy and validate */
109 if (copy_from_user(&l, arg, sizeof(l)))
110 return -EFAULT;
111
112 if ((l.object_size & ~PAGE_MASK) ||
113 (l.stripe_unit & ~PAGE_MASK) ||
114 !l.stripe_unit ||
115 (l.object_size &&
116 (unsigned)l.object_size % (unsigned)l.stripe_unit))
117 return -EINVAL;
118
119 /* make sure it's a valid data pool */
120 if (l.data_pool > 0) {
121 mutex_lock(&mdsc->mutex);
122 err = -EINVAL;
123 for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
124 if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
125 err = 0;
126 break;
127 }
128 mutex_unlock(&mdsc->mutex);
129 if (err)
130 return err;
131 }
132
133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETDIRLAYOUT,
134 USE_AUTH_MDS);
135
136 if (IS_ERR(req))
137 return PTR_ERR(req);
138 req->r_inode = igrab(inode);
139
140 req->r_args.setlayout.layout.fl_stripe_unit =
141 cpu_to_le32(l.stripe_unit);
142 req->r_args.setlayout.layout.fl_stripe_count =
143 cpu_to_le32(l.stripe_count);
144 req->r_args.setlayout.layout.fl_object_size =
145 cpu_to_le32(l.object_size);
146 req->r_args.setlayout.layout.fl_pg_pool =
147 cpu_to_le32(l.data_pool);
148 req->r_args.setlayout.layout.fl_pg_preferred =
149 cpu_to_le32(l.preferred_osd);
150
151 err = ceph_mdsc_do_request(mdsc, inode, req);
152 ceph_mdsc_put_request(req);
153 return err;
154}
155
156/*
93 * Return object name, size/offset information, and location (OSD 157 * Return object name, size/offset information, and location (OSD
94 * number, network address) for a given file offset. 158 * number, network address) for a given file offset.
95 */ 159 */
@@ -98,7 +162,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
98 struct ceph_ioctl_dataloc dl; 162 struct ceph_ioctl_dataloc dl;
99 struct inode *inode = file->f_dentry->d_inode; 163 struct inode *inode = file->f_dentry->d_inode;
100 struct ceph_inode_info *ci = ceph_inode(inode); 164 struct ceph_inode_info *ci = ceph_inode(inode);
101 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; 165 struct ceph_osd_client *osdc =
166 &ceph_sb_to_client(inode->i_sb)->client->osdc;
102 u64 len = 1, olen; 167 u64 len = 1, olen;
103 u64 tmp; 168 u64 tmp;
104 struct ceph_object_layout ol; 169 struct ceph_object_layout ol;
@@ -174,11 +239,15 @@ long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
174 case CEPH_IOC_SET_LAYOUT: 239 case CEPH_IOC_SET_LAYOUT:
175 return ceph_ioctl_set_layout(file, (void __user *)arg); 240 return ceph_ioctl_set_layout(file, (void __user *)arg);
176 241
242 case CEPH_IOC_SET_LAYOUT_POLICY:
243 return ceph_ioctl_set_layout_policy(file, (void __user *)arg);
244
177 case CEPH_IOC_GET_DATALOC: 245 case CEPH_IOC_GET_DATALOC:
178 return ceph_ioctl_get_dataloc(file, (void __user *)arg); 246 return ceph_ioctl_get_dataloc(file, (void __user *)arg);
179 247
180 case CEPH_IOC_LAZYIO: 248 case CEPH_IOC_LAZYIO:
181 return ceph_ioctl_lazyio(file); 249 return ceph_ioctl_lazyio(file);
182 } 250 }
251
183 return -ENOTTY; 252 return -ENOTTY;
184} 253}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
index 88451a3b6857..a6ce54e94eb5 100644
--- a/fs/ceph/ioctl.h
+++ b/fs/ceph/ioctl.h
@@ -4,7 +4,7 @@
4#include <linux/ioctl.h> 4#include <linux/ioctl.h>
5#include <linux/types.h> 5#include <linux/types.h>
6 6
7#define CEPH_IOCTL_MAGIC 0x97 7#define CEPH_IOCTL_MAGIC 0x98
8 8
9/* just use u64 to align sanely on all archs */ 9/* just use u64 to align sanely on all archs */
10struct ceph_ioctl_layout { 10struct ceph_ioctl_layout {
@@ -17,6 +17,8 @@ struct ceph_ioctl_layout {
17 struct ceph_ioctl_layout) 17 struct ceph_ioctl_layout)
18#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \ 18#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \
19 struct ceph_ioctl_layout) 19 struct ceph_ioctl_layout)
20#define CEPH_IOC_SET_LAYOUT_POLICY _IOW(CEPH_IOCTL_MAGIC, 5, \
21 struct ceph_ioctl_layout)
20 22
21/* 23/*
22 * Extract identity, address of the OSD and object storing a given 24 * Extract identity, address of the OSD and object storing a given
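The new CEPH_IOC_SET_LAYOUT_POLICY ioctl reuses struct ceph_ioctl_layout, so user space drives it the same way as CEPH_IOC_SET_LAYOUT, just against a directory. A sketch of a caller, using only the fields that ceph_ioctl_set_layout_policy() above reads (stripe_unit, stripe_count, object_size, data_pool, preferred_osd); the values are illustrative, not recommendations:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include "ioctl.h"	/* fs/ceph/ioctl.h: CEPH_IOC_SET_LAYOUT_POLICY */

	static int set_dir_layout_policy(const char *path)
	{
		struct ceph_ioctl_layout l = {
			.stripe_unit   = 4194304,	/* page-multiple, non-zero */
			.stripe_count  = 1,
			.object_size   = 4194304,	/* multiple of stripe_unit */
			.data_pool     = 0,		/* only values > 0 are checked against the mdsmap */
			.preferred_osd = -1,
		};
		int fd = open(path, O_RDONLY);
		int ret;

		if (fd < 0)
			return -1;
		ret = ioctl(fd, CEPH_IOC_SET_LAYOUT_POLICY, &l);
		if (ret < 0)
			perror("CEPH_IOC_SET_LAYOUT_POLICY");
		close(fd);
		return ret;
	}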
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index ff4e753aae92..40abde93c345 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -1,11 +1,11 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/file.h> 3#include <linux/file.h>
4#include <linux/namei.h> 4#include <linux/namei.h>
5 5
6#include "super.h" 6#include "super.h"
7#include "mds_client.h" 7#include "mds_client.h"
8#include "pagelist.h" 8#include <linux/ceph/pagelist.h>
9 9
10/** 10/**
11 * Implement fcntl and flock locking functions. 11 * Implement fcntl and flock locking functions.
@@ -16,7 +16,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
16{ 16{
17 struct inode *inode = file->f_dentry->d_inode; 17 struct inode *inode = file->f_dentry->d_inode;
18 struct ceph_mds_client *mdsc = 18 struct ceph_mds_client *mdsc =
19 &ceph_sb_to_client(inode->i_sb)->mdsc; 19 ceph_sb_to_client(inode->i_sb)->mdsc;
20 struct ceph_mds_request *req; 20 struct ceph_mds_request *req;
21 int err; 21 int err;
22 22
@@ -181,8 +181,9 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
181 * Encode the flock and fcntl locks for the given inode into the pagelist. 181 * Encode the flock and fcntl locks for the given inode into the pagelist.
182 * Format is: #fcntl locks, sequential fcntl locks, #flock locks, 182 * Format is: #fcntl locks, sequential fcntl locks, #flock locks,
183 * sequential flock locks. 183 * sequential flock locks.
184 * Must be called with BLK already held, and the lock numbers should have 184 * Must be called with lock_flocks() already held.
185 * been gathered under the same lock holding window. 185 * If we encounter more of a specific lock type than expected,
 186 * we return -ENOSPC.
186 */ 187 */
187int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, 188int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
188 int num_fcntl_locks, int num_flock_locks) 189 int num_fcntl_locks, int num_flock_locks)
@@ -190,6 +191,8 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
190 struct file_lock *lock; 191 struct file_lock *lock;
191 struct ceph_filelock cephlock; 192 struct ceph_filelock cephlock;
192 int err = 0; 193 int err = 0;
194 int seen_fcntl = 0;
195 int seen_flock = 0;
193 196
194 dout("encoding %d flock and %d fcntl locks", num_flock_locks, 197 dout("encoding %d flock and %d fcntl locks", num_flock_locks,
195 num_fcntl_locks); 198 num_fcntl_locks);
@@ -198,6 +201,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
198 goto fail; 201 goto fail;
199 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { 202 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
200 if (lock->fl_flags & FL_POSIX) { 203 if (lock->fl_flags & FL_POSIX) {
204 ++seen_fcntl;
205 if (seen_fcntl > num_fcntl_locks) {
206 err = -ENOSPC;
207 goto fail;
208 }
201 err = lock_to_ceph_filelock(lock, &cephlock); 209 err = lock_to_ceph_filelock(lock, &cephlock);
202 if (err) 210 if (err)
203 goto fail; 211 goto fail;
@@ -213,6 +221,11 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist,
213 goto fail; 221 goto fail;
214 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { 222 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
215 if (lock->fl_flags & FL_FLOCK) { 223 if (lock->fl_flags & FL_FLOCK) {
224 ++seen_flock;
225 if (seen_flock > num_flock_locks) {
226 err = -ENOSPC;
227 goto fail;
228 }
216 err = lock_to_ceph_filelock(lock, &cephlock); 229 err = lock_to_ceph_filelock(lock, &cephlock);
217 if (err) 230 if (err)
218 goto fail; 231 goto fail;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index fad95f8f2608..3142b15940c2 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1,17 +1,21 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/fs.h>
3#include <linux/wait.h> 4#include <linux/wait.h>
4#include <linux/slab.h> 5#include <linux/slab.h>
5#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/debugfs.h>
8#include <linux/seq_file.h>
6#include <linux/smp_lock.h> 9#include <linux/smp_lock.h>
7 10
8#include "mds_client.h"
9#include "mon_client.h"
10#include "super.h" 11#include "super.h"
11#include "messenger.h" 12#include "mds_client.h"
12#include "decode.h" 13
13#include "auth.h" 14#include <linux/ceph/messenger.h>
14#include "pagelist.h" 15#include <linux/ceph/decode.h>
16#include <linux/ceph/pagelist.h>
17#include <linux/ceph/auth.h>
18#include <linux/ceph/debugfs.h>
15 19
16/* 20/*
17 * A cluster of MDS (metadata server) daemons is responsible for 21 * A cluster of MDS (metadata server) daemons is responsible for
@@ -286,8 +290,9 @@ void ceph_put_mds_session(struct ceph_mds_session *s)
286 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); 290 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
287 if (atomic_dec_and_test(&s->s_ref)) { 291 if (atomic_dec_and_test(&s->s_ref)) {
288 if (s->s_authorizer) 292 if (s->s_authorizer)
289 s->s_mdsc->client->monc.auth->ops->destroy_authorizer( 293 s->s_mdsc->fsc->client->monc.auth->ops->destroy_authorizer(
290 s->s_mdsc->client->monc.auth, s->s_authorizer); 294 s->s_mdsc->fsc->client->monc.auth,
295 s->s_authorizer);
291 kfree(s); 296 kfree(s);
292 } 297 }
293} 298}
@@ -344,7 +349,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
344 s->s_seq = 0; 349 s->s_seq = 0;
345 mutex_init(&s->s_mutex); 350 mutex_init(&s->s_mutex);
346 351
347 ceph_con_init(mdsc->client->msgr, &s->s_con); 352 ceph_con_init(mdsc->fsc->client->msgr, &s->s_con);
348 s->s_con.private = s; 353 s->s_con.private = s;
349 s->s_con.ops = &mds_con_ops; 354 s->s_con.ops = &mds_con_ops;
350 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS; 355 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
@@ -599,7 +604,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
599 } else if (req->r_dentry) { 604 } else if (req->r_dentry) {
600 struct inode *dir = req->r_dentry->d_parent->d_inode; 605 struct inode *dir = req->r_dentry->d_parent->d_inode;
601 606
602 if (dir->i_sb != mdsc->client->sb) { 607 if (dir->i_sb != mdsc->fsc->sb) {
603 /* not this fs! */ 608 /* not this fs! */
604 inode = req->r_dentry->d_inode; 609 inode = req->r_dentry->d_inode;
605 } else if (ceph_snap(dir) != CEPH_NOSNAP) { 610 } else if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -884,7 +889,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
884 __ceph_remove_cap(cap); 889 __ceph_remove_cap(cap);
885 if (!__ceph_is_any_real_caps(ci)) { 890 if (!__ceph_is_any_real_caps(ci)) {
886 struct ceph_mds_client *mdsc = 891 struct ceph_mds_client *mdsc =
887 &ceph_sb_to_client(inode->i_sb)->mdsc; 892 ceph_sb_to_client(inode->i_sb)->mdsc;
888 893
889 spin_lock(&mdsc->cap_dirty_lock); 894 spin_lock(&mdsc->cap_dirty_lock);
890 if (!list_empty(&ci->i_dirty_item)) { 895 if (!list_empty(&ci->i_dirty_item)) {
@@ -1146,7 +1151,7 @@ int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
1146 struct ceph_msg *msg, *partial = NULL; 1151 struct ceph_msg *msg, *partial = NULL;
1147 struct ceph_mds_cap_release *head; 1152 struct ceph_mds_cap_release *head;
1148 int err = -ENOMEM; 1153 int err = -ENOMEM;
1149 int extra = mdsc->client->mount_args->cap_release_safety; 1154 int extra = mdsc->fsc->mount_options->cap_release_safety;
1150 int num; 1155 int num;
1151 1156
1152 dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds, 1157 dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds,
@@ -2085,7 +2090,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2085 2090
2086 /* insert trace into our cache */ 2091 /* insert trace into our cache */
2087 mutex_lock(&req->r_fill_mutex); 2092 mutex_lock(&req->r_fill_mutex);
2088 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session); 2093 err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
2089 if (err == 0) { 2094 if (err == 0) {
2090 if (result == 0 && rinfo->dir_nr) 2095 if (result == 0 && rinfo->dir_nr)
2091 ceph_readdir_prepopulate(req, req->r_session); 2096 ceph_readdir_prepopulate(req, req->r_session);
@@ -2361,19 +2366,35 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2361 2366
2362 if (recon_state->flock) { 2367 if (recon_state->flock) {
2363 int num_fcntl_locks, num_flock_locks; 2368 int num_fcntl_locks, num_flock_locks;
2364 2369 struct ceph_pagelist_cursor trunc_point;
2365 lock_kernel(); 2370
2366 ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); 2371 ceph_pagelist_set_cursor(pagelist, &trunc_point);
2367 rec.v2.flock_len = (2*sizeof(u32) + 2372 do {
2368 (num_fcntl_locks+num_flock_locks) * 2373 lock_flocks();
2369 sizeof(struct ceph_filelock)); 2374 ceph_count_locks(inode, &num_fcntl_locks,
2370 2375 &num_flock_locks);
2371 err = ceph_pagelist_append(pagelist, &rec, reclen); 2376 rec.v2.flock_len = (2*sizeof(u32) +
2372 if (!err) 2377 (num_fcntl_locks+num_flock_locks) *
2373 err = ceph_encode_locks(inode, pagelist, 2378 sizeof(struct ceph_filelock));
2374 num_fcntl_locks, 2379 unlock_flocks();
2375 num_flock_locks); 2380
2376 unlock_kernel(); 2381 /* pre-alloc pagelist */
2382 ceph_pagelist_truncate(pagelist, &trunc_point);
2383 err = ceph_pagelist_append(pagelist, &rec, reclen);
2384 if (!err)
2385 err = ceph_pagelist_reserve(pagelist,
2386 rec.v2.flock_len);
2387
2388 /* encode locks */
2389 if (!err) {
2390 lock_flocks();
2391 err = ceph_encode_locks(inode,
2392 pagelist,
2393 num_fcntl_locks,
2394 num_flock_locks);
2395 unlock_flocks();
2396 }
2397 } while (err == -ENOSPC);
2377 } else { 2398 } else {
2378 err = ceph_pagelist_append(pagelist, &rec, reclen); 2399 err = ceph_pagelist_append(pagelist, &rec, reclen);
2379 } 2400 }
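The reconnect path above drops the big kernel lock (lock_kernel/unlock_kernel) in favour of lock_flocks() and turns the one-shot encode into a retry loop: locks are counted and encoded while lock_flocks() is held, but the pagelist reservation must be made with the lock dropped, so if additional file locks appear in between, ceph_encode_locks() runs out of reserved space and the loop truncates back to the saved cursor and tries again. A condensed sketch of that pattern, using the same helpers as the hunk (the caps record header append and most error handling are omitted for brevity):

static int encode_locks_with_retry(struct inode *inode,
				   struct ceph_pagelist *pagelist)
{
	struct ceph_pagelist_cursor trunc_point;
	int num_fcntl_locks, num_flock_locks;
	int err;

	ceph_pagelist_set_cursor(pagelist, &trunc_point);
	do {
		/* count under the lock, then drop it to allocate pages */
		lock_flocks();
		ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
		unlock_flocks();

		ceph_pagelist_truncate(pagelist, &trunc_point);
		err = ceph_pagelist_reserve(pagelist, 2 * sizeof(u32) +
				(num_fcntl_locks + num_flock_locks) *
				sizeof(struct ceph_filelock));
		if (err)
			break;

		/* re-take the lock to encode; -ENOSPC means the count grew */
		lock_flocks();
		err = ceph_encode_locks(inode, pagelist,
					num_fcntl_locks, num_flock_locks);
		unlock_flocks();
	} while (err == -ENOSPC);

	return err;
}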
@@ -2613,7 +2634,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2613 struct ceph_mds_session *session, 2634 struct ceph_mds_session *session,
2614 struct ceph_msg *msg) 2635 struct ceph_msg *msg)
2615{ 2636{
2616 struct super_block *sb = mdsc->client->sb; 2637 struct super_block *sb = mdsc->fsc->sb;
2617 struct inode *inode; 2638 struct inode *inode;
2618 struct ceph_inode_info *ci; 2639 struct ceph_inode_info *ci;
2619 struct dentry *parent, *dentry; 2640 struct dentry *parent, *dentry;
@@ -2891,10 +2912,16 @@ static void delayed_work(struct work_struct *work)
2891 schedule_delayed(mdsc); 2912 schedule_delayed(mdsc);
2892} 2913}
2893 2914
2915int ceph_mdsc_init(struct ceph_fs_client *fsc)
2894 2916
2895int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2896{ 2917{
2897 mdsc->client = client; 2918 struct ceph_mds_client *mdsc;
2919
2920 mdsc = kzalloc(sizeof(struct ceph_mds_client), GFP_NOFS);
2921 if (!mdsc)
2922 return -ENOMEM;
2923 mdsc->fsc = fsc;
2924 fsc->mdsc = mdsc;
2898 mutex_init(&mdsc->mutex); 2925 mutex_init(&mdsc->mutex);
2899 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); 2926 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
2900 if (mdsc->mdsmap == NULL) 2927 if (mdsc->mdsmap == NULL)
@@ -2927,7 +2954,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2927 INIT_LIST_HEAD(&mdsc->dentry_lru); 2954 INIT_LIST_HEAD(&mdsc->dentry_lru);
2928 2955
2929 ceph_caps_init(mdsc); 2956 ceph_caps_init(mdsc);
2930 ceph_adjust_min_caps(mdsc, client->min_caps); 2957 ceph_adjust_min_caps(mdsc, fsc->min_caps);
2931 2958
2932 return 0; 2959 return 0;
2933} 2960}
@@ -2939,7 +2966,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2939static void wait_requests(struct ceph_mds_client *mdsc) 2966static void wait_requests(struct ceph_mds_client *mdsc)
2940{ 2967{
2941 struct ceph_mds_request *req; 2968 struct ceph_mds_request *req;
2942 struct ceph_client *client = mdsc->client; 2969 struct ceph_fs_client *fsc = mdsc->fsc;
2943 2970
2944 mutex_lock(&mdsc->mutex); 2971 mutex_lock(&mdsc->mutex);
2945 if (__get_oldest_req(mdsc)) { 2972 if (__get_oldest_req(mdsc)) {
@@ -2947,7 +2974,7 @@ static void wait_requests(struct ceph_mds_client *mdsc)
2947 2974
2948 dout("wait_requests waiting for requests\n"); 2975 dout("wait_requests waiting for requests\n");
2949 wait_for_completion_timeout(&mdsc->safe_umount_waiters, 2976 wait_for_completion_timeout(&mdsc->safe_umount_waiters,
2950 client->mount_args->mount_timeout * HZ); 2977 fsc->client->options->mount_timeout * HZ);
2951 2978
2952 /* tear down remaining requests */ 2979 /* tear down remaining requests */
2953 mutex_lock(&mdsc->mutex); 2980 mutex_lock(&mdsc->mutex);
@@ -3030,7 +3057,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
3030{ 3057{
3031 u64 want_tid, want_flush; 3058 u64 want_tid, want_flush;
3032 3059
3033 if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN) 3060 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
3034 return; 3061 return;
3035 3062
3036 dout("sync\n"); 3063 dout("sync\n");
@@ -3053,7 +3080,7 @@ bool done_closing_sessions(struct ceph_mds_client *mdsc)
3053{ 3080{
3054 int i, n = 0; 3081 int i, n = 0;
3055 3082
3056 if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN) 3083 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
3057 return true; 3084 return true;
3058 3085
3059 mutex_lock(&mdsc->mutex); 3086 mutex_lock(&mdsc->mutex);
@@ -3071,8 +3098,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
3071{ 3098{
3072 struct ceph_mds_session *session; 3099 struct ceph_mds_session *session;
3073 int i; 3100 int i;
3074 struct ceph_client *client = mdsc->client; 3101 struct ceph_fs_client *fsc = mdsc->fsc;
3075 unsigned long timeout = client->mount_args->mount_timeout * HZ; 3102 unsigned long timeout = fsc->client->options->mount_timeout * HZ;
3076 3103
3077 dout("close_sessions\n"); 3104 dout("close_sessions\n");
3078 3105
@@ -3119,7 +3146,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
3119 dout("stopped\n"); 3146 dout("stopped\n");
3120} 3147}
3121 3148
3122void ceph_mdsc_stop(struct ceph_mds_client *mdsc) 3149static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
3123{ 3150{
3124 dout("stop\n"); 3151 dout("stop\n");
3125 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ 3152 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
@@ -3129,6 +3156,15 @@ void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
3129 ceph_caps_finalize(mdsc); 3156 ceph_caps_finalize(mdsc);
3130} 3157}
3131 3158
3159void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
3160{
3161 struct ceph_mds_client *mdsc = fsc->mdsc;
3162
3163 ceph_mdsc_stop(mdsc);
3164 fsc->mdsc = NULL;
3165 kfree(mdsc);
3166}
3167
3132 3168
3133/* 3169/*
3134 * handle mds map update. 3170 * handle mds map update.
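With these hunks the MDS client is no longer embedded in the client structure: ceph_mdsc_init() now takes the ceph_fs_client, allocates the mds client itself and cross-links fsc->mdsc and mdsc->fsc, ceph_mdsc_stop() becomes an internal helper, and ceph_mdsc_destroy() is the new external teardown entry point. A sketch of the resulting lifecycle using only the functions shown above (the mount step in the middle is elided):

static int mdsc_lifecycle_sketch(struct ceph_fs_client *fsc)
{
	int err;

	err = ceph_mdsc_init(fsc);	/* kzalloc()s the mdsc, sets fsc->mdsc */
	if (err)
		return err;

	/* ... mount and issue MDS requests through fsc->mdsc ... */

	ceph_mdsc_destroy(fsc);		/* ceph_mdsc_stop() + kfree(), clears fsc->mdsc */
	return 0;
}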
@@ -3145,14 +3181,14 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
3145 3181
3146 ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad); 3182 ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
3147 ceph_decode_copy(&p, &fsid, sizeof(fsid)); 3183 ceph_decode_copy(&p, &fsid, sizeof(fsid));
3148 if (ceph_check_fsid(mdsc->client, &fsid) < 0) 3184 if (ceph_check_fsid(mdsc->fsc->client, &fsid) < 0)
3149 return; 3185 return;
3150 epoch = ceph_decode_32(&p); 3186 epoch = ceph_decode_32(&p);
3151 maplen = ceph_decode_32(&p); 3187 maplen = ceph_decode_32(&p);
3152 dout("handle_map epoch %u len %d\n", epoch, (int)maplen); 3188 dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
3153 3189
3154 /* do we need it? */ 3190 /* do we need it? */
3155 ceph_monc_got_mdsmap(&mdsc->client->monc, epoch); 3191 ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch);
3156 mutex_lock(&mdsc->mutex); 3192 mutex_lock(&mdsc->mutex);
3157 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { 3193 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
3158 dout("handle_map epoch %u <= our %u\n", 3194 dout("handle_map epoch %u <= our %u\n",
@@ -3176,7 +3212,7 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
3176 } else { 3212 } else {
3177 mdsc->mdsmap = newmap; /* first mds map */ 3213 mdsc->mdsmap = newmap; /* first mds map */
3178 } 3214 }
3179 mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; 3215 mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
3180 3216
3181 __wake_requests(mdsc, &mdsc->waiting_for_map); 3217 __wake_requests(mdsc, &mdsc->waiting_for_map);
3182 3218
@@ -3277,7 +3313,7 @@ static int get_authorizer(struct ceph_connection *con,
3277{ 3313{
3278 struct ceph_mds_session *s = con->private; 3314 struct ceph_mds_session *s = con->private;
3279 struct ceph_mds_client *mdsc = s->s_mdsc; 3315 struct ceph_mds_client *mdsc = s->s_mdsc;
3280 struct ceph_auth_client *ac = mdsc->client->monc.auth; 3316 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3281 int ret = 0; 3317 int ret = 0;
3282 3318
3283 if (force_new && s->s_authorizer) { 3319 if (force_new && s->s_authorizer) {
@@ -3311,7 +3347,7 @@ static int verify_authorizer_reply(struct ceph_connection *con, int len)
3311{ 3347{
3312 struct ceph_mds_session *s = con->private; 3348 struct ceph_mds_session *s = con->private;
3313 struct ceph_mds_client *mdsc = s->s_mdsc; 3349 struct ceph_mds_client *mdsc = s->s_mdsc;
3314 struct ceph_auth_client *ac = mdsc->client->monc.auth; 3350 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3315 3351
3316 return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len); 3352 return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
3317} 3353}
@@ -3320,12 +3356,12 @@ static int invalidate_authorizer(struct ceph_connection *con)
3320{ 3356{
3321 struct ceph_mds_session *s = con->private; 3357 struct ceph_mds_session *s = con->private;
3322 struct ceph_mds_client *mdsc = s->s_mdsc; 3358 struct ceph_mds_client *mdsc = s->s_mdsc;
3323 struct ceph_auth_client *ac = mdsc->client->monc.auth; 3359 struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth;
3324 3360
3325 if (ac->ops->invalidate_authorizer) 3361 if (ac->ops->invalidate_authorizer)
3326 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS); 3362 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
3327 3363
3328 return ceph_monc_validate_auth(&mdsc->client->monc); 3364 return ceph_monc_validate_auth(&mdsc->fsc->client->monc);
3329} 3365}
3330 3366
3331static const struct ceph_connection_operations mds_con_ops = { 3367static const struct ceph_connection_operations mds_con_ops = {
@@ -3338,7 +3374,4 @@ static const struct ceph_connection_operations mds_con_ops = {
3338 .peer_reset = peer_reset, 3374 .peer_reset = peer_reset,
3339}; 3375};
3340 3376
3341
3342
3343
3344/* eof */ 3377/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index c98267ce6d2a..d66d63c72355 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -8,9 +8,9 @@
8#include <linux/rbtree.h> 8#include <linux/rbtree.h>
9#include <linux/spinlock.h> 9#include <linux/spinlock.h>
10 10
11#include "types.h" 11#include <linux/ceph/types.h>
12#include "messenger.h" 12#include <linux/ceph/messenger.h>
13#include "mdsmap.h" 13#include <linux/ceph/mdsmap.h>
14 14
15/* 15/*
16 * Some lock dependencies: 16 * Some lock dependencies:
@@ -26,7 +26,7 @@
26 * 26 *
27 */ 27 */
28 28
29struct ceph_client; 29struct ceph_fs_client;
30struct ceph_cap; 30struct ceph_cap;
31 31
32/* 32/*
@@ -230,7 +230,7 @@ struct ceph_mds_request {
230 * mds client state 230 * mds client state
231 */ 231 */
232struct ceph_mds_client { 232struct ceph_mds_client {
233 struct ceph_client *client; 233 struct ceph_fs_client *fsc;
234 struct mutex mutex; /* all nested structures */ 234 struct mutex mutex; /* all nested structures */
235 235
236 struct ceph_mdsmap *mdsmap; 236 struct ceph_mdsmap *mdsmap;
@@ -289,11 +289,6 @@ struct ceph_mds_client {
289 int caps_avail_count; /* unused, unreserved */ 289 int caps_avail_count; /* unused, unreserved */
290 int caps_min_count; /* keep at least this many 290 int caps_min_count; /* keep at least this many
291 (unreserved) */ 291 (unreserved) */
292
293#ifdef CONFIG_DEBUG_FS
294 struct dentry *debugfs_file;
295#endif
296
297 spinlock_t dentry_lru_lock; 292 spinlock_t dentry_lru_lock;
298 struct list_head dentry_lru; 293 struct list_head dentry_lru;
299 int num_dentry; 294 int num_dentry;
@@ -316,10 +311,9 @@ extern void ceph_put_mds_session(struct ceph_mds_session *s);
316extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc, 311extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
317 struct ceph_msg *msg, int mds); 312 struct ceph_msg *msg, int mds);
318 313
319extern int ceph_mdsc_init(struct ceph_mds_client *mdsc, 314extern int ceph_mdsc_init(struct ceph_fs_client *fsc);
320 struct ceph_client *client);
321extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc); 315extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
322extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc); 316extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
323 317
324extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc); 318extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
325 319
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index 040be6d1150b..73b7d44e8a35 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/bug.h> 3#include <linux/bug.h>
4#include <linux/err.h> 4#include <linux/err.h>
@@ -6,9 +6,9 @@
6#include <linux/slab.h> 6#include <linux/slab.h>
7#include <linux/types.h> 7#include <linux/types.h>
8 8
9#include "mdsmap.h" 9#include <linux/ceph/mdsmap.h>
10#include "messenger.h" 10#include <linux/ceph/messenger.h>
11#include "decode.h" 11#include <linux/ceph/decode.h>
12 12
13#include "super.h" 13#include "super.h"
14 14
@@ -117,7 +117,8 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
117 } 117 }
118 118
119 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n", 119 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
120 i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr), 120 i+1, n, global_id, mds, inc,
121 ceph_pr_addr(&addr.in_addr),
121 ceph_mds_state_name(state)); 122 ceph_mds_state_name(state));
122 if (mds >= 0 && mds < m->m_max_mds && state > 0) { 123 if (mds >= 0 && mds < m->m_max_mds && state > 0) {
123 m->m_info[mds].global_id = global_id; 124 m->m_info[mds].global_id = global_id;
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
deleted file mode 100644
index 46a368b6dce5..000000000000
--- a/fs/ceph/pagelist.c
+++ /dev/null
@@ -1,63 +0,0 @@
1
2#include <linux/gfp.h>
3#include <linux/pagemap.h>
4#include <linux/highmem.h>
5
6#include "pagelist.h"
7
8static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
9{
10 struct page *page = list_entry(pl->head.prev, struct page,
11 lru);
12 kunmap(page);
13}
14
15int ceph_pagelist_release(struct ceph_pagelist *pl)
16{
17 if (pl->mapped_tail)
18 ceph_pagelist_unmap_tail(pl);
19
20 while (!list_empty(&pl->head)) {
21 struct page *page = list_first_entry(&pl->head, struct page,
22 lru);
23 list_del(&page->lru);
24 __free_page(page);
25 }
26 return 0;
27}
28
29static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
30{
31 struct page *page = __page_cache_alloc(GFP_NOFS);
32 if (!page)
33 return -ENOMEM;
34 pl->room += PAGE_SIZE;
35 list_add_tail(&page->lru, &pl->head);
36 if (pl->mapped_tail)
37 ceph_pagelist_unmap_tail(pl);
38 pl->mapped_tail = kmap(page);
39 return 0;
40}
41
42int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
43{
44 while (pl->room < len) {
45 size_t bit = pl->room;
46 int ret;
47
48 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
49 buf, bit);
50 pl->length += bit;
51 pl->room -= bit;
52 buf += bit;
53 len -= bit;
54 ret = ceph_pagelist_addpage(pl);
55 if (ret)
56 return ret;
57 }
58
59 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
60 pl->length += len;
61 pl->room -= len;
62 return 0;
63}
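The fs/ceph copy of the pagelist code is deleted here; per the include changes earlier in the patch it now comes from <linux/ceph/pagelist.h> in the shared library. The deleted implementation documents the behaviour callers rely on: append fills the remaining room of the kmap()ed tail page and allocates and maps a fresh page when that room runs out, and release unmaps the tail and frees every page. A small usage sketch; ceph_pagelist_init() is assumed from the header and is not part of this diff:

static int pagelist_usage_sketch(void)
{
	struct ceph_pagelist pl;
	u32 value = 42;
	int err;

	ceph_pagelist_init(&pl);	/* assumed helper: empty page list, zero length/room */
	err = ceph_pagelist_append(&pl, &value, sizeof(value));
	ceph_pagelist_release(&pl);	/* kunmap()s the tail, __free_page()s everything */
	return err;
}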
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 190b6c4a6f2b..39c243acd062 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -1,10 +1,12 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/sort.h> 3#include <linux/sort.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
5 5
6#include "super.h" 6#include "super.h"
7#include "decode.h" 7#include "mds_client.h"
8
9#include <linux/ceph/decode.h>
8 10
9/* 11/*
10 * Snapshots in ceph are driven in large part by cooperation from the 12 * Snapshots in ceph are driven in large part by cooperation from the
@@ -526,7 +528,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
526 struct ceph_cap_snap *capsnap) 528 struct ceph_cap_snap *capsnap)
527{ 529{
528 struct inode *inode = &ci->vfs_inode; 530 struct inode *inode = &ci->vfs_inode;
529 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc; 531 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
530 532
531 BUG_ON(capsnap->writing); 533 BUG_ON(capsnap->writing);
532 capsnap->size = inode->i_size; 534 capsnap->size = inode->i_size;
@@ -747,7 +749,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
747 struct ceph_mds_session *session, 749 struct ceph_mds_session *session,
748 struct ceph_msg *msg) 750 struct ceph_msg *msg)
749{ 751{
750 struct super_block *sb = mdsc->client->sb; 752 struct super_block *sb = mdsc->fsc->sb;
751 int mds = session->s_mds; 753 int mds = session->s_mds;
752 u64 split; 754 u64 split;
753 int op; 755 int op;
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/strings.c
index c6179d3a26a2..cd5097d7c804 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/strings.c
@@ -1,71 +1,9 @@
1/* 1/*
2 * Ceph string constants 2 * Ceph fs string constants
3 */ 3 */
4#include "types.h" 4#include <linux/module.h>
5#include <linux/ceph/types.h>
5 6
6const char *ceph_entity_type_name(int type)
7{
8 switch (type) {
9 case CEPH_ENTITY_TYPE_MDS: return "mds";
10 case CEPH_ENTITY_TYPE_OSD: return "osd";
11 case CEPH_ENTITY_TYPE_MON: return "mon";
12 case CEPH_ENTITY_TYPE_CLIENT: return "client";
13 case CEPH_ENTITY_TYPE_AUTH: return "auth";
14 default: return "unknown";
15 }
16}
17
18const char *ceph_osd_op_name(int op)
19{
20 switch (op) {
21 case CEPH_OSD_OP_READ: return "read";
22 case CEPH_OSD_OP_STAT: return "stat";
23
24 case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
25
26 case CEPH_OSD_OP_WRITE: return "write";
27 case CEPH_OSD_OP_DELETE: return "delete";
28 case CEPH_OSD_OP_TRUNCATE: return "truncate";
29 case CEPH_OSD_OP_ZERO: return "zero";
30 case CEPH_OSD_OP_WRITEFULL: return "writefull";
31 case CEPH_OSD_OP_ROLLBACK: return "rollback";
32
33 case CEPH_OSD_OP_APPEND: return "append";
34 case CEPH_OSD_OP_STARTSYNC: return "startsync";
35 case CEPH_OSD_OP_SETTRUNC: return "settrunc";
36 case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
37
38 case CEPH_OSD_OP_TMAPUP: return "tmapup";
39 case CEPH_OSD_OP_TMAPGET: return "tmapget";
40 case CEPH_OSD_OP_TMAPPUT: return "tmapput";
41
42 case CEPH_OSD_OP_GETXATTR: return "getxattr";
43 case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
44 case CEPH_OSD_OP_SETXATTR: return "setxattr";
45 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
46 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
47 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
48 case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
49
50 case CEPH_OSD_OP_PULL: return "pull";
51 case CEPH_OSD_OP_PUSH: return "push";
52 case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
53 case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
54 case CEPH_OSD_OP_SCRUB: return "scrub";
55
56 case CEPH_OSD_OP_WRLOCK: return "wrlock";
57 case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
58 case CEPH_OSD_OP_RDLOCK: return "rdlock";
59 case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
60 case CEPH_OSD_OP_UPLOCK: return "uplock";
61 case CEPH_OSD_OP_DNLOCK: return "dnlock";
62
63 case CEPH_OSD_OP_CALL: return "call";
64
65 case CEPH_OSD_OP_PGLS: return "pgls";
66 }
67 return "???";
68}
69 7
70const char *ceph_mds_state_name(int s) 8const char *ceph_mds_state_name(int s)
71{ 9{
@@ -177,17 +115,3 @@ const char *ceph_snap_op_name(int o)
177 } 115 }
178 return "???"; 116 return "???";
179} 117}
180
181const char *ceph_pool_op_name(int op)
182{
183 switch (op) {
184 case POOL_OP_CREATE: return "create";
185 case POOL_OP_DELETE: return "delete";
186 case POOL_OP_AUID_CHANGE: return "auid change";
187 case POOL_OP_CREATE_SNAP: return "create snap";
188 case POOL_OP_DELETE_SNAP: return "delete snap";
189 case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
190 case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
191 }
192 return "???";
193}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 9922628532b2..d6e0e0421891 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1,5 +1,5 @@
1 1
2#include "ceph_debug.h" 2#include <linux/ceph/ceph_debug.h>
3 3
4#include <linux/backing-dev.h> 4#include <linux/backing-dev.h>
5#include <linux/ctype.h> 5#include <linux/ctype.h>
@@ -15,10 +15,13 @@
15#include <linux/statfs.h> 15#include <linux/statfs.h>
16#include <linux/string.h> 16#include <linux/string.h>
17 17
18#include "decode.h"
19#include "super.h" 18#include "super.h"
20#include "mon_client.h" 19#include "mds_client.h"
21#include "auth.h" 20
21#include <linux/ceph/decode.h>
22#include <linux/ceph/mon_client.h>
23#include <linux/ceph/auth.h>
24#include <linux/ceph/debugfs.h>
22 25
23/* 26/*
24 * Ceph superblock operations 27 * Ceph superblock operations
@@ -26,36 +29,22 @@
26 * Handle the basics of mounting, unmounting. 29 * Handle the basics of mounting, unmounting.
27 */ 30 */
28 31
29
30/*
31 * find filename portion of a path (/foo/bar/baz -> baz)
32 */
33const char *ceph_file_part(const char *s, int len)
34{
35 const char *e = s + len;
36
37 while (e != s && *(e-1) != '/')
38 e--;
39 return e;
40}
41
42
43/* 32/*
44 * super ops 33 * super ops
45 */ 34 */
46static void ceph_put_super(struct super_block *s) 35static void ceph_put_super(struct super_block *s)
47{ 36{
48 struct ceph_client *client = ceph_sb_to_client(s); 37 struct ceph_fs_client *fsc = ceph_sb_to_client(s);
49 38
50 dout("put_super\n"); 39 dout("put_super\n");
51 ceph_mdsc_close_sessions(&client->mdsc); 40 ceph_mdsc_close_sessions(fsc->mdsc);
52 41
53 /* 42 /*
54 * ensure we release the bdi before put_anon_super releases 43 * ensure we release the bdi before put_anon_super releases
55 * the device name. 44 * the device name.
56 */ 45 */
57 if (s->s_bdi == &client->backing_dev_info) { 46 if (s->s_bdi == &fsc->backing_dev_info) {
58 bdi_unregister(&client->backing_dev_info); 47 bdi_unregister(&fsc->backing_dev_info);
59 s->s_bdi = NULL; 48 s->s_bdi = NULL;
60 } 49 }
61 50
@@ -64,14 +53,14 @@ static void ceph_put_super(struct super_block *s)
64 53
65static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) 54static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
66{ 55{
67 struct ceph_client *client = ceph_inode_to_client(dentry->d_inode); 56 struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode);
68 struct ceph_monmap *monmap = client->monc.monmap; 57 struct ceph_monmap *monmap = fsc->client->monc.monmap;
69 struct ceph_statfs st; 58 struct ceph_statfs st;
70 u64 fsid; 59 u64 fsid;
71 int err; 60 int err;
72 61
73 dout("statfs\n"); 62 dout("statfs\n");
74 err = ceph_monc_do_statfs(&client->monc, &st); 63 err = ceph_monc_do_statfs(&fsc->client->monc, &st);
75 if (err < 0) 64 if (err < 0)
76 return err; 65 return err;
77 66
@@ -104,238 +93,28 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
104 93
105static int ceph_sync_fs(struct super_block *sb, int wait) 94static int ceph_sync_fs(struct super_block *sb, int wait)
106{ 95{
107 struct ceph_client *client = ceph_sb_to_client(sb); 96 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
108 97
109 if (!wait) { 98 if (!wait) {
110 dout("sync_fs (non-blocking)\n"); 99 dout("sync_fs (non-blocking)\n");
111 ceph_flush_dirty_caps(&client->mdsc); 100 ceph_flush_dirty_caps(fsc->mdsc);
112 dout("sync_fs (non-blocking) done\n"); 101 dout("sync_fs (non-blocking) done\n");
113 return 0; 102 return 0;
114 } 103 }
115 104
116 dout("sync_fs (blocking)\n"); 105 dout("sync_fs (blocking)\n");
117 ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc); 106 ceph_osdc_sync(&fsc->client->osdc);
118 ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc); 107 ceph_mdsc_sync(fsc->mdsc);
119 dout("sync_fs (blocking) done\n"); 108 dout("sync_fs (blocking) done\n");
120 return 0; 109 return 0;
121} 110}
122 111
123static int default_congestion_kb(void)
124{
125 int congestion_kb;
126
127 /*
128 * Copied from NFS
129 *
130 * congestion size, scale with available memory.
131 *
132 * 64MB: 8192k
133 * 128MB: 11585k
134 * 256MB: 16384k
135 * 512MB: 23170k
136 * 1GB: 32768k
137 * 2GB: 46340k
138 * 4GB: 65536k
139 * 8GB: 92681k
140 * 16GB: 131072k
141 *
142 * This allows larger machines to have larger/more transfers.
143 * Limit the default to 256M
144 */
145 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
146 if (congestion_kb > 256*1024)
147 congestion_kb = 256*1024;
148
149 return congestion_kb;
150}
151
152/**
153 * ceph_show_options - Show mount options in /proc/mounts
154 * @m: seq_file to write to
155 * @mnt: mount descriptor
156 */
157static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
158{
159 struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
160 struct ceph_mount_args *args = client->mount_args;
161
162 if (args->flags & CEPH_OPT_FSID)
163 seq_printf(m, ",fsid=%pU", &args->fsid);
164 if (args->flags & CEPH_OPT_NOSHARE)
165 seq_puts(m, ",noshare");
166 if (args->flags & CEPH_OPT_DIRSTAT)
167 seq_puts(m, ",dirstat");
168 if ((args->flags & CEPH_OPT_RBYTES) == 0)
169 seq_puts(m, ",norbytes");
170 if (args->flags & CEPH_OPT_NOCRC)
171 seq_puts(m, ",nocrc");
172 if (args->flags & CEPH_OPT_NOASYNCREADDIR)
173 seq_puts(m, ",noasyncreaddir");
174
175 if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
176 seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
177 if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
178 seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
179 if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
180 seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
181 if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
182 seq_printf(m, ",osdkeepalivetimeout=%d",
183 args->osd_keepalive_timeout);
184 if (args->wsize)
185 seq_printf(m, ",wsize=%d", args->wsize);
186 if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
187 seq_printf(m, ",rsize=%d", args->rsize);
188 if (args->congestion_kb != default_congestion_kb())
189 seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
190 if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
191 seq_printf(m, ",caps_wanted_delay_min=%d",
192 args->caps_wanted_delay_min);
193 if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
194 seq_printf(m, ",caps_wanted_delay_max=%d",
195 args->caps_wanted_delay_max);
196 if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
197 seq_printf(m, ",cap_release_safety=%d",
198 args->cap_release_safety);
199 if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
200 seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
201 if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
202 seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
203 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
204 seq_printf(m, ",snapdirname=%s", args->snapdir_name);
205 if (args->name)
206 seq_printf(m, ",name=%s", args->name);
207 if (args->secret)
208 seq_puts(m, ",secret=<hidden>");
209 return 0;
210}
211
212/*
213 * caches
214 */
215struct kmem_cache *ceph_inode_cachep;
216struct kmem_cache *ceph_cap_cachep;
217struct kmem_cache *ceph_dentry_cachep;
218struct kmem_cache *ceph_file_cachep;
219
220static void ceph_inode_init_once(void *foo)
221{
222 struct ceph_inode_info *ci = foo;
223 inode_init_once(&ci->vfs_inode);
224}
225
226static int __init init_caches(void)
227{
228 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
229 sizeof(struct ceph_inode_info),
230 __alignof__(struct ceph_inode_info),
231 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
232 ceph_inode_init_once);
233 if (ceph_inode_cachep == NULL)
234 return -ENOMEM;
235
236 ceph_cap_cachep = KMEM_CACHE(ceph_cap,
237 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
238 if (ceph_cap_cachep == NULL)
239 goto bad_cap;
240
241 ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
242 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
243 if (ceph_dentry_cachep == NULL)
244 goto bad_dentry;
245
246 ceph_file_cachep = KMEM_CACHE(ceph_file_info,
247 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
248 if (ceph_file_cachep == NULL)
249 goto bad_file;
250
251 return 0;
252
253bad_file:
254 kmem_cache_destroy(ceph_dentry_cachep);
255bad_dentry:
256 kmem_cache_destroy(ceph_cap_cachep);
257bad_cap:
258 kmem_cache_destroy(ceph_inode_cachep);
259 return -ENOMEM;
260}
261
262static void destroy_caches(void)
263{
264 kmem_cache_destroy(ceph_inode_cachep);
265 kmem_cache_destroy(ceph_cap_cachep);
266 kmem_cache_destroy(ceph_dentry_cachep);
267 kmem_cache_destroy(ceph_file_cachep);
268}
269
270
271/*
272 * ceph_umount_begin - initiate forced umount. Tear down down the
273 * mount, skipping steps that may hang while waiting for server(s).
274 */
275static void ceph_umount_begin(struct super_block *sb)
276{
277 struct ceph_client *client = ceph_sb_to_client(sb);
278
279 dout("ceph_umount_begin - starting forced umount\n");
280 if (!client)
281 return;
282 client->mount_state = CEPH_MOUNT_SHUTDOWN;
283 return;
284}
285
286static const struct super_operations ceph_super_ops = {
287 .alloc_inode = ceph_alloc_inode,
288 .destroy_inode = ceph_destroy_inode,
289 .write_inode = ceph_write_inode,
290 .sync_fs = ceph_sync_fs,
291 .put_super = ceph_put_super,
292 .show_options = ceph_show_options,
293 .statfs = ceph_statfs,
294 .umount_begin = ceph_umount_begin,
295};
296
297
298const char *ceph_msg_type_name(int type)
299{
300 switch (type) {
301 case CEPH_MSG_SHUTDOWN: return "shutdown";
302 case CEPH_MSG_PING: return "ping";
303 case CEPH_MSG_AUTH: return "auth";
304 case CEPH_MSG_AUTH_REPLY: return "auth_reply";
305 case CEPH_MSG_MON_MAP: return "mon_map";
306 case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
307 case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
308 case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
309 case CEPH_MSG_STATFS: return "statfs";
310 case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
311 case CEPH_MSG_MDS_MAP: return "mds_map";
312 case CEPH_MSG_CLIENT_SESSION: return "client_session";
313 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
314 case CEPH_MSG_CLIENT_REQUEST: return "client_request";
315 case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
316 case CEPH_MSG_CLIENT_REPLY: return "client_reply";
317 case CEPH_MSG_CLIENT_CAPS: return "client_caps";
318 case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
319 case CEPH_MSG_CLIENT_SNAP: return "client_snap";
320 case CEPH_MSG_CLIENT_LEASE: return "client_lease";
321 case CEPH_MSG_OSD_MAP: return "osd_map";
322 case CEPH_MSG_OSD_OP: return "osd_op";
323 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
324 default: return "unknown";
325 }
326}
327
328
329/* 112/*
330 * mount options 113 * mount options
331 */ 114 */
332enum { 115enum {
333 Opt_wsize, 116 Opt_wsize,
334 Opt_rsize, 117 Opt_rsize,
335 Opt_osdtimeout,
336 Opt_osdkeepalivetimeout,
337 Opt_mount_timeout,
338 Opt_osd_idle_ttl,
339 Opt_caps_wanted_delay_min, 118 Opt_caps_wanted_delay_min,
340 Opt_caps_wanted_delay_max, 119 Opt_caps_wanted_delay_max,
341 Opt_cap_release_safety, 120 Opt_cap_release_safety,
@@ -344,29 +123,19 @@ enum {
344 Opt_congestion_kb, 123 Opt_congestion_kb,
345 Opt_last_int, 124 Opt_last_int,
346 /* int args above */ 125 /* int args above */
347 Opt_fsid,
348 Opt_snapdirname, 126 Opt_snapdirname,
349 Opt_name,
350 Opt_secret,
351 Opt_last_string, 127 Opt_last_string,
352 /* string args above */ 128 /* string args above */
353 Opt_ip,
354 Opt_noshare,
355 Opt_dirstat, 129 Opt_dirstat,
356 Opt_nodirstat, 130 Opt_nodirstat,
357 Opt_rbytes, 131 Opt_rbytes,
358 Opt_norbytes, 132 Opt_norbytes,
359 Opt_nocrc,
360 Opt_noasyncreaddir, 133 Opt_noasyncreaddir,
361}; 134};
362 135
363static match_table_t arg_tokens = { 136static match_table_t fsopt_tokens = {
364 {Opt_wsize, "wsize=%d"}, 137 {Opt_wsize, "wsize=%d"},
365 {Opt_rsize, "rsize=%d"}, 138 {Opt_rsize, "rsize=%d"},
366 {Opt_osdtimeout, "osdtimeout=%d"},
367 {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
368 {Opt_mount_timeout, "mount_timeout=%d"},
369 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
370 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, 139 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
371 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, 140 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
372 {Opt_cap_release_safety, "cap_release_safety=%d"}, 141 {Opt_cap_release_safety, "cap_release_safety=%d"},
@@ -374,403 +143,459 @@ static match_table_t arg_tokens = {
374 {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, 143 {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
375 {Opt_congestion_kb, "write_congestion_kb=%d"}, 144 {Opt_congestion_kb, "write_congestion_kb=%d"},
376 /* int args above */ 145 /* int args above */
377 {Opt_fsid, "fsid=%s"},
378 {Opt_snapdirname, "snapdirname=%s"}, 146 {Opt_snapdirname, "snapdirname=%s"},
379 {Opt_name, "name=%s"},
380 {Opt_secret, "secret=%s"},
381 /* string args above */ 147 /* string args above */
382 {Opt_ip, "ip=%s"},
383 {Opt_noshare, "noshare"},
384 {Opt_dirstat, "dirstat"}, 148 {Opt_dirstat, "dirstat"},
385 {Opt_nodirstat, "nodirstat"}, 149 {Opt_nodirstat, "nodirstat"},
386 {Opt_rbytes, "rbytes"}, 150 {Opt_rbytes, "rbytes"},
387 {Opt_norbytes, "norbytes"}, 151 {Opt_norbytes, "norbytes"},
388 {Opt_nocrc, "nocrc"},
389 {Opt_noasyncreaddir, "noasyncreaddir"}, 152 {Opt_noasyncreaddir, "noasyncreaddir"},
390 {-1, NULL} 153 {-1, NULL}
391}; 154};
392 155
393static int parse_fsid(const char *str, struct ceph_fsid *fsid) 156static int parse_fsopt_token(char *c, void *private)
394{ 157{
395 int i = 0; 158 struct ceph_mount_options *fsopt = private;
396 char tmp[3]; 159 substring_t argstr[MAX_OPT_ARGS];
397 int err = -EINVAL; 160 int token, intval, ret;
398 int d; 161
399 162 token = match_token((char *)c, fsopt_tokens, argstr);
400 dout("parse_fsid '%s'\n", str); 163 if (token < 0)
401 tmp[2] = 0; 164 return -EINVAL;
402 while (*str && i < 16) { 165
403 if (ispunct(*str)) { 166 if (token < Opt_last_int) {
404 str++; 167 ret = match_int(&argstr[0], &intval);
405 continue; 168 if (ret < 0) {
169 pr_err("bad mount option arg (not int) "
170 "at '%s'\n", c);
171 return ret;
406 } 172 }
407 if (!isxdigit(str[0]) || !isxdigit(str[1])) 173 dout("got int token %d val %d\n", token, intval);
408 break; 174 } else if (token > Opt_last_int && token < Opt_last_string) {
409 tmp[0] = str[0]; 175 dout("got string token %d val %s\n", token,
410 tmp[1] = str[1]; 176 argstr[0].from);
411 if (sscanf(tmp, "%x", &d) < 1) 177 } else {
412 break; 178 dout("got token %d\n", token);
413 fsid->fsid[i] = d & 0xff;
414 i++;
415 str += 2;
416 } 179 }
417 180
418 if (i == 16) 181 switch (token) {
419 err = 0; 182 case Opt_snapdirname:
420 dout("parse_fsid ret %d got fsid %pU", err, fsid); 183 kfree(fsopt->snapdir_name);
421 return err; 184 fsopt->snapdir_name = kstrndup(argstr[0].from,
185 argstr[0].to-argstr[0].from,
186 GFP_KERNEL);
187 if (!fsopt->snapdir_name)
188 return -ENOMEM;
189 break;
190
191 /* misc */
192 case Opt_wsize:
193 fsopt->wsize = intval;
194 break;
195 case Opt_rsize:
196 fsopt->rsize = intval;
197 break;
198 case Opt_caps_wanted_delay_min:
199 fsopt->caps_wanted_delay_min = intval;
200 break;
201 case Opt_caps_wanted_delay_max:
202 fsopt->caps_wanted_delay_max = intval;
203 break;
204 case Opt_readdir_max_entries:
205 fsopt->max_readdir = intval;
206 break;
207 case Opt_readdir_max_bytes:
208 fsopt->max_readdir_bytes = intval;
209 break;
210 case Opt_congestion_kb:
211 fsopt->congestion_kb = intval;
212 break;
213 case Opt_dirstat:
214 fsopt->flags |= CEPH_MOUNT_OPT_DIRSTAT;
215 break;
216 case Opt_nodirstat:
217 fsopt->flags &= ~CEPH_MOUNT_OPT_DIRSTAT;
218 break;
219 case Opt_rbytes:
220 fsopt->flags |= CEPH_MOUNT_OPT_RBYTES;
221 break;
222 case Opt_norbytes:
223 fsopt->flags &= ~CEPH_MOUNT_OPT_RBYTES;
224 break;
225 case Opt_noasyncreaddir:
226 fsopt->flags |= CEPH_MOUNT_OPT_NOASYNCREADDIR;
227 break;
228 default:
229 BUG_ON(token);
230 }
231 return 0;
422} 232}
423 233
424static struct ceph_mount_args *parse_mount_args(int flags, char *options, 234static void destroy_mount_options(struct ceph_mount_options *args)
425 const char *dev_name,
426 const char **path)
427{ 235{
428 struct ceph_mount_args *args; 236 dout("destroy_mount_options %p\n", args);
429 const char *c; 237 kfree(args->snapdir_name);
430 int err = -ENOMEM; 238 kfree(args);
431 substring_t argstr[MAX_OPT_ARGS]; 239}
432 240
433 args = kzalloc(sizeof(*args), GFP_KERNEL); 241static int strcmp_null(const char *s1, const char *s2)
434 if (!args) 242{
435 return ERR_PTR(-ENOMEM); 243 if (!s1 && !s2)
436 args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr), 244 return 0;
437 GFP_KERNEL); 245 if (s1 && !s2)
438 if (!args->mon_addr) 246 return -1;
439 goto out; 247 if (!s1 && s2)
248 return 1;
249 return strcmp(s1, s2);
250}
440 251
441 dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name); 252static int compare_mount_options(struct ceph_mount_options *new_fsopt,
442 253 struct ceph_options *new_opt,
443 /* start with defaults */ 254 struct ceph_fs_client *fsc)
444 args->sb_flags = flags; 255{
445 args->flags = CEPH_OPT_DEFAULT; 256 struct ceph_mount_options *fsopt1 = new_fsopt;
446 args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT; 257 struct ceph_mount_options *fsopt2 = fsc->mount_options;
447 args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; 258 int ofs = offsetof(struct ceph_mount_options, snapdir_name);
448 args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */ 259 int ret;
449 args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
450 args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
451 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
452 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
453 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
454 args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
455 args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
456 args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
457 args->congestion_kb = default_congestion_kb();
458
459 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
460 err = -EINVAL;
461 if (!dev_name)
462 goto out;
463 *path = strstr(dev_name, ":/");
464 if (*path == NULL) {
465 pr_err("device name is missing path (no :/ in %s)\n",
466 dev_name);
467 goto out;
468 }
469 260
470 /* get mon ip(s) */ 261 ret = memcmp(fsopt1, fsopt2, ofs);
471 err = ceph_parse_ips(dev_name, *path, args->mon_addr, 262 if (ret)
472 CEPH_MAX_MON, &args->num_mon); 263 return ret;
473 if (err < 0) 264
474 goto out; 265 ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
266 if (ret)
267 return ret;
268
269 return ceph_compare_options(new_opt, fsc->client);
270}
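compare_mount_options() above is what lets a later mount decide whether it can share an existing super block: the scalar fields of ceph_mount_options are compared with a single memcmp() up to offsetof(..., snapdir_name), the string member is compared separately with the NULL-tolerant strcmp_null(), and the shared options are delegated to ceph_compare_options(). The memcmp-up-to-offsetof idiom works only because everything laid out before snapdir_name is plain scalar data; a standalone illustration with a hypothetical userspace struct (not from the patch):

#include <stddef.h>	/* offsetof */
#include <string.h>	/* memcmp */

struct opts {
	int wsize;
	int rsize;
	int flags;
	int max_readdir;	/* four ints, so no padding precedes the pointer */
	char *snapdir_name;	/* first non-scalar member */
};

static int opts_scalar_fields_equal(const struct opts *a, const struct opts *b)
{
	/* compares the four ints in one shot; the pointer is excluded */
	return memcmp(a, b, offsetof(struct opts, snapdir_name)) == 0;
}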
271
272static int parse_mount_options(struct ceph_mount_options **pfsopt,
273 struct ceph_options **popt,
274 int flags, char *options,
275 const char *dev_name,
276 const char **path)
277{
278 struct ceph_mount_options *fsopt;
279 const char *dev_name_end;
280 int err = -ENOMEM;
281
282 fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
283 if (!fsopt)
284 return -ENOMEM;
285
286 dout("parse_mount_options %p, dev_name '%s'\n", fsopt, dev_name);
287
288 fsopt->sb_flags = flags;
289 fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;
290
291 fsopt->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
292 fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
293 fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
294 fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
295 fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
296 fsopt->congestion_kb = default_congestion_kb();
297
298 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
299 err = -EINVAL;
300 if (!dev_name)
301 goto out;
302 *path = strstr(dev_name, ":/");
303 if (*path == NULL) {
304 pr_err("device name is missing path (no :/ in %s)\n",
305 dev_name);
306 goto out;
307 }
308 dev_name_end = *path;
309 dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name);
475 310
476 /* path on server */ 311 /* path on server */
477 *path += 2; 312 *path += 2;
478 dout("server path '%s'\n", *path); 313 dout("server path '%s'\n", *path);
479 314
480 /* parse mount options */ 315 err = ceph_parse_options(popt, options, dev_name, dev_name_end,
481 while ((c = strsep(&options, ",")) != NULL) { 316 parse_fsopt_token, (void *)fsopt);
482 int token, intval, ret; 317 if (err)
483 if (!*c) 318 goto out;
484 continue; 319
485 err = -EINVAL; 320 /* success */
486 token = match_token((char *)c, arg_tokens, argstr); 321 *pfsopt = fsopt;
487 if (token < 0) { 322 return 0;
488 pr_err("bad mount option at '%s'\n", c);
489 goto out;
490 }
491 if (token < Opt_last_int) {
492 ret = match_int(&argstr[0], &intval);
493 if (ret < 0) {
494 pr_err("bad mount option arg (not int) "
495 "at '%s'\n", c);
496 continue;
497 }
498 dout("got int token %d val %d\n", token, intval);
499 } else if (token > Opt_last_int && token < Opt_last_string) {
500 dout("got string token %d val %s\n", token,
501 argstr[0].from);
502 } else {
503 dout("got token %d\n", token);
504 }
505 switch (token) {
506 case Opt_ip:
507 err = ceph_parse_ips(argstr[0].from,
508 argstr[0].to,
509 &args->my_addr,
510 1, NULL);
511 if (err < 0)
512 goto out;
513 args->flags |= CEPH_OPT_MYIP;
514 break;
515
516 case Opt_fsid:
517 err = parse_fsid(argstr[0].from, &args->fsid);
518 if (err == 0)
519 args->flags |= CEPH_OPT_FSID;
520 break;
521 case Opt_snapdirname:
522 kfree(args->snapdir_name);
523 args->snapdir_name = kstrndup(argstr[0].from,
524 argstr[0].to-argstr[0].from,
525 GFP_KERNEL);
526 break;
527 case Opt_name:
528 args->name = kstrndup(argstr[0].from,
529 argstr[0].to-argstr[0].from,
530 GFP_KERNEL);
531 break;
532 case Opt_secret:
533 args->secret = kstrndup(argstr[0].from,
534 argstr[0].to-argstr[0].from,
535 GFP_KERNEL);
536 break;
537
538 /* misc */
539 case Opt_wsize:
540 args->wsize = intval;
541 break;
542 case Opt_rsize:
543 args->rsize = intval;
544 break;
545 case Opt_osdtimeout:
546 args->osd_timeout = intval;
547 break;
548 case Opt_osdkeepalivetimeout:
549 args->osd_keepalive_timeout = intval;
550 break;
551 case Opt_osd_idle_ttl:
552 args->osd_idle_ttl = intval;
553 break;
554 case Opt_mount_timeout:
555 args->mount_timeout = intval;
556 break;
557 case Opt_caps_wanted_delay_min:
558 args->caps_wanted_delay_min = intval;
559 break;
560 case Opt_caps_wanted_delay_max:
561 args->caps_wanted_delay_max = intval;
562 break;
563 case Opt_readdir_max_entries:
564 args->max_readdir = intval;
565 break;
566 case Opt_readdir_max_bytes:
567 args->max_readdir_bytes = intval;
568 break;
569 case Opt_congestion_kb:
570 args->congestion_kb = intval;
571 break;
572
573 case Opt_noshare:
574 args->flags |= CEPH_OPT_NOSHARE;
575 break;
576
577 case Opt_dirstat:
578 args->flags |= CEPH_OPT_DIRSTAT;
579 break;
580 case Opt_nodirstat:
581 args->flags &= ~CEPH_OPT_DIRSTAT;
582 break;
583 case Opt_rbytes:
584 args->flags |= CEPH_OPT_RBYTES;
585 break;
586 case Opt_norbytes:
587 args->flags &= ~CEPH_OPT_RBYTES;
588 break;
589 case Opt_nocrc:
590 args->flags |= CEPH_OPT_NOCRC;
591 break;
592 case Opt_noasyncreaddir:
593 args->flags |= CEPH_OPT_NOASYNCREADDIR;
594 break;
595
596 default:
597 BUG_ON(token);
598 }
599 }
600 return args;
601 323
602out: 324out:
603 kfree(args->mon_addr); 325 destroy_mount_options(fsopt);
604 kfree(args); 326 return err;
605 return ERR_PTR(err);
606} 327}
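parse_mount_args() is replaced here by a two-level scheme: parse_mount_options() keeps the fs-specific defaults and token table (fsopt_tokens), and everything else, including the mon address list that used to be parsed with ceph_parse_ips(), is handed to ceph_parse_options() together with parse_fsopt_token() as a callback; the shared parser presumably forwards any token it does not recognise to that callback. A sketch of the wiring as it appears in this hunk (defaults and the device-name checks are trimmed):

/* sketch, assuming the ceph_parse_options() callback convention described above */
static int parse_sketch(char *options, const char *dev_name,
			const char *dev_name_end, int flags,
			struct ceph_mount_options **pfsopt,
			struct ceph_options **popt)
{
	struct ceph_mount_options *fsopt;
	int err;

	fsopt = kzalloc(sizeof(*fsopt), GFP_KERNEL);
	if (!fsopt)
		return -ENOMEM;
	fsopt->sb_flags = flags;
	fsopt->flags = CEPH_MOUNT_OPT_DEFAULT;

	/* fs-level tokens go to parse_fsopt_token(), the rest to the shared parser */
	err = ceph_parse_options(popt, options, dev_name, dev_name_end,
				 parse_fsopt_token, fsopt);
	if (err) {
		destroy_mount_options(fsopt);
		return err;
	}
	*pfsopt = fsopt;
	return 0;
}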
607 328
608static void destroy_mount_args(struct ceph_mount_args *args) 329/**
330 * ceph_show_options - Show mount options in /proc/mounts
331 * @m: seq_file to write to
332 * @mnt: mount descriptor
333 */
334static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
609{ 335{
610 dout("destroy_mount_args %p\n", args); 336 struct ceph_fs_client *fsc = ceph_sb_to_client(mnt->mnt_sb);
611 kfree(args->snapdir_name); 337 struct ceph_mount_options *fsopt = fsc->mount_options;
612 args->snapdir_name = NULL; 338 struct ceph_options *opt = fsc->client->options;
613 kfree(args->name); 339
614 args->name = NULL; 340 if (opt->flags & CEPH_OPT_FSID)
615 kfree(args->secret); 341 seq_printf(m, ",fsid=%pU", &opt->fsid);
616 args->secret = NULL; 342 if (opt->flags & CEPH_OPT_NOSHARE)
617 kfree(args); 343 seq_puts(m, ",noshare");
344 if (opt->flags & CEPH_OPT_NOCRC)
345 seq_puts(m, ",nocrc");
346
347 if (opt->name)
348 seq_printf(m, ",name=%s", opt->name);
349 if (opt->secret)
350 seq_puts(m, ",secret=<hidden>");
351
352 if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
353 seq_printf(m, ",mount_timeout=%d", opt->mount_timeout);
354 if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
355 seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl);
356 if (opt->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
357 seq_printf(m, ",osdtimeout=%d", opt->osd_timeout);
358 if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
359 seq_printf(m, ",osdkeepalivetimeout=%d",
360 opt->osd_keepalive_timeout);
361
362 if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
363 seq_puts(m, ",dirstat");
364 if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0)
365 seq_puts(m, ",norbytes");
366 if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
367 seq_puts(m, ",noasyncreaddir");
368
369 if (fsopt->wsize)
370 seq_printf(m, ",wsize=%d", fsopt->wsize);
371 if (fsopt->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
372 seq_printf(m, ",rsize=%d", fsopt->rsize);
373 if (fsopt->congestion_kb != default_congestion_kb())
374 seq_printf(m, ",write_congestion_kb=%d", fsopt->congestion_kb);
375 if (fsopt->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
376 seq_printf(m, ",caps_wanted_delay_min=%d",
377 fsopt->caps_wanted_delay_min);
378 if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
379 seq_printf(m, ",caps_wanted_delay_max=%d",
380 fsopt->caps_wanted_delay_max);
381 if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
382 seq_printf(m, ",cap_release_safety=%d",
383 fsopt->cap_release_safety);
384 if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT)
385 seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir);
386 if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
387 seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes);
388 if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
389 seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name);
390 return 0;
618} 391}
619 392
620/* 393/*
621 * create a fresh client instance 394 * handle any mon messages the standard library doesn't understand.
395 * return error if we don't either.
622 */ 396 */
623static struct ceph_client *ceph_create_client(struct ceph_mount_args *args) 397static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
624{ 398{
625 struct ceph_client *client; 399 struct ceph_fs_client *fsc = client->private;
400 int type = le16_to_cpu(msg->hdr.type);
401
402 switch (type) {
403 case CEPH_MSG_MDS_MAP:
404 ceph_mdsc_handle_map(fsc->mdsc, msg);
405 return 0;
406
407 default:
408 return -1;
409 }
410}
411
412/*
413 * create a new fs client
414 */
415struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
416 struct ceph_options *opt)
417{
418 struct ceph_fs_client *fsc;
626 int err = -ENOMEM; 419 int err = -ENOMEM;
627 420
628 client = kzalloc(sizeof(*client), GFP_KERNEL); 421 fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
629 if (client == NULL) 422 if (!fsc)
630 return ERR_PTR(-ENOMEM); 423 return ERR_PTR(-ENOMEM);
631 424
632 mutex_init(&client->mount_mutex); 425 fsc->client = ceph_create_client(opt, fsc);
633 426 if (IS_ERR(fsc->client)) {
634 init_waitqueue_head(&client->auth_wq); 427 err = PTR_ERR(fsc->client);
428 goto fail;
429 }
430 fsc->client->extra_mon_dispatch = extra_mon_dispatch;
431 fsc->client->supported_features |= CEPH_FEATURE_FLOCK;
432 fsc->client->monc.want_mdsmap = 1;
635 433
636 client->sb = NULL; 434 fsc->mount_options = fsopt;
637 client->mount_state = CEPH_MOUNT_MOUNTING;
638 client->mount_args = args;
639 435
640 client->msgr = NULL; 436 fsc->sb = NULL;
437 fsc->mount_state = CEPH_MOUNT_MOUNTING;
641 438
642 client->auth_err = 0; 439 atomic_long_set(&fsc->writeback_count, 0);
643 atomic_long_set(&client->writeback_count, 0);
644 440
645 err = bdi_init(&client->backing_dev_info); 441 err = bdi_init(&fsc->backing_dev_info);
646 if (err < 0) 442 if (err < 0)
647 goto fail; 443 goto fail_client;
648 444
649 err = -ENOMEM; 445 err = -ENOMEM;
650 client->wb_wq = create_workqueue("ceph-writeback"); 446 fsc->wb_wq = create_workqueue("ceph-writeback");
651 if (client->wb_wq == NULL) 447 if (fsc->wb_wq == NULL)
652 goto fail_bdi; 448 goto fail_bdi;
653 client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid"); 449 fsc->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
654 if (client->pg_inv_wq == NULL) 450 if (fsc->pg_inv_wq == NULL)
655 goto fail_wb_wq; 451 goto fail_wb_wq;
656 client->trunc_wq = create_singlethread_workqueue("ceph-trunc"); 452 fsc->trunc_wq = create_singlethread_workqueue("ceph-trunc");
657 if (client->trunc_wq == NULL) 453 if (fsc->trunc_wq == NULL)
658 goto fail_pg_inv_wq; 454 goto fail_pg_inv_wq;
659 455
660 /* set up mempools */ 456 /* set up mempools */
661 err = -ENOMEM; 457 err = -ENOMEM;
662 client->wb_pagevec_pool = mempool_create_kmalloc_pool(10, 458 fsc->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
663 client->mount_args->wsize >> PAGE_CACHE_SHIFT); 459 fsc->mount_options->wsize >> PAGE_CACHE_SHIFT);
664 if (!client->wb_pagevec_pool) 460 if (!fsc->wb_pagevec_pool)
665 goto fail_trunc_wq; 461 goto fail_trunc_wq;
666 462
667 /* caps */ 463 /* caps */
668 client->min_caps = args->max_readdir; 464 fsc->min_caps = fsopt->max_readdir;
465
466 return fsc;
669 467
670 /* subsystems */
671 err = ceph_monc_init(&client->monc, client);
672 if (err < 0)
673 goto fail_mempool;
674 err = ceph_osdc_init(&client->osdc, client);
675 if (err < 0)
676 goto fail_monc;
677 err = ceph_mdsc_init(&client->mdsc, client);
678 if (err < 0)
679 goto fail_osdc;
680 return client;
681
682fail_osdc:
683 ceph_osdc_stop(&client->osdc);
684fail_monc:
685 ceph_monc_stop(&client->monc);
686fail_mempool:
687 mempool_destroy(client->wb_pagevec_pool);
688fail_trunc_wq: 468fail_trunc_wq:
689 destroy_workqueue(client->trunc_wq); 469 destroy_workqueue(fsc->trunc_wq);
690fail_pg_inv_wq: 470fail_pg_inv_wq:
691 destroy_workqueue(client->pg_inv_wq); 471 destroy_workqueue(fsc->pg_inv_wq);
692fail_wb_wq: 472fail_wb_wq:
693 destroy_workqueue(client->wb_wq); 473 destroy_workqueue(fsc->wb_wq);
694fail_bdi: 474fail_bdi:
695 bdi_destroy(&client->backing_dev_info); 475 bdi_destroy(&fsc->backing_dev_info);
476fail_client:
477 ceph_destroy_client(fsc->client);
696fail: 478fail:
697 kfree(client); 479 kfree(fsc);
698 return ERR_PTR(err); 480 return ERR_PTR(err);
699} 481}
700 482
701static void ceph_destroy_client(struct ceph_client *client) 483void destroy_fs_client(struct ceph_fs_client *fsc)
702{ 484{
703 dout("destroy_client %p\n", client); 485 dout("destroy_fs_client %p\n", fsc);
704 486
705 /* unmount */ 487 destroy_workqueue(fsc->wb_wq);
706 ceph_mdsc_stop(&client->mdsc); 488 destroy_workqueue(fsc->pg_inv_wq);
707 ceph_osdc_stop(&client->osdc); 489 destroy_workqueue(fsc->trunc_wq);
708 490
709 /* 491 bdi_destroy(&fsc->backing_dev_info);
710 * make sure mds and osd connections close out before destroying
711 * the auth module, which is needed to free those connections'
712 * ceph_authorizers.
713 */
714 ceph_msgr_flush();
715
716 ceph_monc_stop(&client->monc);
717 492
718 ceph_debugfs_client_cleanup(client); 493 mempool_destroy(fsc->wb_pagevec_pool);
719 destroy_workqueue(client->wb_wq);
720 destroy_workqueue(client->pg_inv_wq);
721 destroy_workqueue(client->trunc_wq);
722 494
723 bdi_destroy(&client->backing_dev_info); 495 destroy_mount_options(fsc->mount_options);
724 496
725 if (client->msgr) 497 ceph_fs_debugfs_cleanup(fsc);
726 ceph_messenger_destroy(client->msgr);
727 mempool_destroy(client->wb_pagevec_pool);
728 498
729 destroy_mount_args(client->mount_args); 499 ceph_destroy_client(fsc->client);
730 500
731 kfree(client); 501 kfree(fsc);
732 dout("destroy_client %p done\n", client); 502 dout("destroy_fs_client %p done\n", fsc);
733} 503}
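create_fs_client()/destroy_fs_client() replace ceph_create_client()/ceph_destroy_client() at this level: the fs client instantiates the shared client with ceph_create_client(opt, fsc), hooks extra_mon_dispatch() so MDS map messages the shared code does not understand are handed back to the filesystem, advertises CEPH_FEATURE_FLOCK and requests the mdsmap, and keeps only the fs-side resources (writeback/invalidate/truncate workqueues, bdi, pagevec mempool) locally. A sketch of how a caller would pair the two entry points, with error handling trimmed and the mount step elided:

static int fs_client_sketch(struct ceph_mount_options *fsopt,
			    struct ceph_options *opt)
{
	struct ceph_fs_client *fsc;
	int err;

	fsc = create_fs_client(fsopt, opt);	/* wraps ceph_create_client(opt, fsc) */
	if (IS_ERR(fsc))
		return PTR_ERR(fsc);

	err = ceph_mdsc_init(fsc);
	if (err) {
		destroy_fs_client(fsc);
		return err;
	}

	/* ... ceph_mount(fsc, ...), normal operation ... */

	ceph_mdsc_destroy(fsc);
	destroy_fs_client(fsc);	/* workqueues, bdi, mempool, then the shared client */
	return 0;
}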
734 504
735/* 505/*
736 * Initially learn our fsid, or verify an fsid matches. 506 * caches
737 */ 507 */
738int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) 508struct kmem_cache *ceph_inode_cachep;
509struct kmem_cache *ceph_cap_cachep;
510struct kmem_cache *ceph_dentry_cachep;
511struct kmem_cache *ceph_file_cachep;
512
513static void ceph_inode_init_once(void *foo)
739{ 514{
740 if (client->have_fsid) { 515 struct ceph_inode_info *ci = foo;
741 if (ceph_fsid_compare(&client->fsid, fsid)) { 516 inode_init_once(&ci->vfs_inode);
742 pr_err("bad fsid, had %pU got %pU", 517}
743 &client->fsid, fsid); 518
744 return -1; 519static int __init init_caches(void)
745 } 520{
746 } else { 521 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
747 pr_info("client%lld fsid %pU\n", client->monc.auth->global_id, 522 sizeof(struct ceph_inode_info),
748 fsid); 523 __alignof__(struct ceph_inode_info),
749 memcpy(&client->fsid, fsid, sizeof(*fsid)); 524 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
750 ceph_debugfs_client_init(client); 525 ceph_inode_init_once);
751 client->have_fsid = true; 526 if (ceph_inode_cachep == NULL)
752 } 527 return -ENOMEM;
528
529 ceph_cap_cachep = KMEM_CACHE(ceph_cap,
530 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
531 if (ceph_cap_cachep == NULL)
532 goto bad_cap;
533
534 ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
535 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
536 if (ceph_dentry_cachep == NULL)
537 goto bad_dentry;
538
539 ceph_file_cachep = KMEM_CACHE(ceph_file_info,
540 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
541 if (ceph_file_cachep == NULL)
542 goto bad_file;
543
753 return 0; 544 return 0;
545
546bad_file:
547 kmem_cache_destroy(ceph_dentry_cachep);
548bad_dentry:
549 kmem_cache_destroy(ceph_cap_cachep);
550bad_cap:
551 kmem_cache_destroy(ceph_inode_cachep);
552 return -ENOMEM;
754} 553}
755 554
555static void destroy_caches(void)
556{
557 kmem_cache_destroy(ceph_inode_cachep);
558 kmem_cache_destroy(ceph_cap_cachep);
559 kmem_cache_destroy(ceph_dentry_cachep);
560 kmem_cache_destroy(ceph_file_cachep);
561}
562
563
756/* 564/*
757 * true if we have the mon map (and have thus joined the cluster) 565 * ceph_umount_begin - initiate forced umount. Tear down down the
566 * mount, skipping steps that may hang while waiting for server(s).
758 */ 567 */
759static int have_mon_and_osd_map(struct ceph_client *client) 568static void ceph_umount_begin(struct super_block *sb)
760{ 569{
761 return client->monc.monmap && client->monc.monmap->epoch && 570 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
762 client->osdc.osdmap && client->osdc.osdmap->epoch; 571
572 dout("ceph_umount_begin - starting forced umount\n");
573 if (!fsc)
574 return;
575 fsc->mount_state = CEPH_MOUNT_SHUTDOWN;
576 return;
763} 577}
764 578
579static const struct super_operations ceph_super_ops = {
580 .alloc_inode = ceph_alloc_inode,
581 .destroy_inode = ceph_destroy_inode,
582 .write_inode = ceph_write_inode,
583 .sync_fs = ceph_sync_fs,
584 .put_super = ceph_put_super,
585 .show_options = ceph_show_options,
586 .statfs = ceph_statfs,
587 .umount_begin = ceph_umount_begin,
588};
589
765/* 590/*
766 * Bootstrap mount by opening the root directory. Note the mount 591 * Bootstrap mount by opening the root directory. Note the mount
767 * @started time from caller, and time out if this takes too long. 592 * @started time from caller, and time out if this takes too long.
768 */ 593 */
769static struct dentry *open_root_dentry(struct ceph_client *client, 594static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
770 const char *path, 595 const char *path,
771 unsigned long started) 596 unsigned long started)
772{ 597{
773 struct ceph_mds_client *mdsc = &client->mdsc; 598 struct ceph_mds_client *mdsc = fsc->mdsc;
774 struct ceph_mds_request *req = NULL; 599 struct ceph_mds_request *req = NULL;
775 int err; 600 int err;
776 struct dentry *root; 601 struct dentry *root;
@@ -784,14 +609,14 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
784 req->r_ino1.ino = CEPH_INO_ROOT; 609 req->r_ino1.ino = CEPH_INO_ROOT;
785 req->r_ino1.snap = CEPH_NOSNAP; 610 req->r_ino1.snap = CEPH_NOSNAP;
786 req->r_started = started; 611 req->r_started = started;
787 req->r_timeout = client->mount_args->mount_timeout * HZ; 612 req->r_timeout = fsc->client->options->mount_timeout * HZ;
788 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); 613 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
789 req->r_num_caps = 2; 614 req->r_num_caps = 2;
790 err = ceph_mdsc_do_request(mdsc, NULL, req); 615 err = ceph_mdsc_do_request(mdsc, NULL, req);
791 if (err == 0) { 616 if (err == 0) {
792 dout("open_root_inode success\n"); 617 dout("open_root_inode success\n");
793 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT && 618 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
794 client->sb->s_root == NULL) 619 fsc->sb->s_root == NULL)
795 root = d_alloc_root(req->r_target_inode); 620 root = d_alloc_root(req->r_target_inode);
796 else 621 else
797 root = d_obtain_alias(req->r_target_inode); 622 root = d_obtain_alias(req->r_target_inode);
@@ -804,105 +629,86 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
804 return root; 629 return root;
805} 630}
806 631
632
633
634
807/* 635/*
808 * mount: join the ceph cluster, and open root directory. 636 * mount: join the ceph cluster, and open root directory.
809 */ 637 */
810static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt, 638static int ceph_mount(struct ceph_fs_client *fsc, struct vfsmount *mnt,
811 const char *path) 639 const char *path)
812{ 640{
813 struct ceph_entity_addr *myaddr = NULL;
814 int err; 641 int err;
815 unsigned long timeout = client->mount_args->mount_timeout * HZ;
816 unsigned long started = jiffies; /* note the start time */ 642 unsigned long started = jiffies; /* note the start time */
817 struct dentry *root; 643 struct dentry *root;
644 int first = 0; /* first vfsmount for this super_block */
818 645
819 dout("mount start\n"); 646 dout("mount start\n");
820 mutex_lock(&client->mount_mutex); 647 mutex_lock(&fsc->client->mount_mutex);
821
822 /* initialize the messenger */
823 if (client->msgr == NULL) {
824 if (ceph_test_opt(client, MYIP))
825 myaddr = &client->mount_args->my_addr;
826 client->msgr = ceph_messenger_create(myaddr);
827 if (IS_ERR(client->msgr)) {
828 err = PTR_ERR(client->msgr);
829 client->msgr = NULL;
830 goto out;
831 }
832 client->msgr->nocrc = ceph_test_opt(client, NOCRC);
833 }
834 648
835 /* open session, and wait for mon, mds, and osd maps */ 649 err = __ceph_open_session(fsc->client, started);
836 err = ceph_monc_open_session(&client->monc);
837 if (err < 0) 650 if (err < 0)
838 goto out; 651 goto out;
839 652
840 while (!have_mon_and_osd_map(client)) {
841 err = -EIO;
842 if (timeout && time_after_eq(jiffies, started + timeout))
843 goto out;
844
845 /* wait */
846 dout("mount waiting for mon_map\n");
847 err = wait_event_interruptible_timeout(client->auth_wq,
848 have_mon_and_osd_map(client) || (client->auth_err < 0),
849 timeout);
850 if (err == -EINTR || err == -ERESTARTSYS)
851 goto out;
852 if (client->auth_err < 0) {
853 err = client->auth_err;
854 goto out;
855 }
856 }
857
858 dout("mount opening root\n"); 653 dout("mount opening root\n");
859 root = open_root_dentry(client, "", started); 654 root = open_root_dentry(fsc, "", started);
860 if (IS_ERR(root)) { 655 if (IS_ERR(root)) {
861 err = PTR_ERR(root); 656 err = PTR_ERR(root);
862 goto out; 657 goto out;
863 } 658 }
864 if (client->sb->s_root) 659 if (fsc->sb->s_root) {
865 dput(root); 660 dput(root);
866 else 661 } else {
867 client->sb->s_root = root; 662 fsc->sb->s_root = root;
663 first = 1;
664
665 err = ceph_fs_debugfs_init(fsc);
666 if (err < 0)
667 goto fail;
668 }
868 669
869 if (path[0] == 0) { 670 if (path[0] == 0) {
870 dget(root); 671 dget(root);
871 } else { 672 } else {
872 dout("mount opening base mountpoint\n"); 673 dout("mount opening base mountpoint\n");
873 root = open_root_dentry(client, path, started); 674 root = open_root_dentry(fsc, path, started);
874 if (IS_ERR(root)) { 675 if (IS_ERR(root)) {
875 err = PTR_ERR(root); 676 err = PTR_ERR(root);
876 dput(client->sb->s_root); 677 goto fail;
877 client->sb->s_root = NULL;
878 goto out;
879 } 678 }
880 } 679 }
881 680
882 mnt->mnt_root = root; 681 mnt->mnt_root = root;
883 mnt->mnt_sb = client->sb; 682 mnt->mnt_sb = fsc->sb;
884 683
885 client->mount_state = CEPH_MOUNT_MOUNTED; 684 fsc->mount_state = CEPH_MOUNT_MOUNTED;
886 dout("mount success\n"); 685 dout("mount success\n");
887 err = 0; 686 err = 0;
888 687
889out: 688out:
890 mutex_unlock(&client->mount_mutex); 689 mutex_unlock(&fsc->client->mount_mutex);
891 return err; 690 return err;
691
692fail:
693 if (first) {
694 dput(fsc->sb->s_root);
695 fsc->sb->s_root = NULL;
696 }
697 goto out;
892} 698}
893 699
894static int ceph_set_super(struct super_block *s, void *data) 700static int ceph_set_super(struct super_block *s, void *data)
895{ 701{
896 struct ceph_client *client = data; 702 struct ceph_fs_client *fsc = data;
897 int ret; 703 int ret;
898 704
899 dout("set_super %p data %p\n", s, data); 705 dout("set_super %p data %p\n", s, data);
900 706
901 s->s_flags = client->mount_args->sb_flags; 707 s->s_flags = fsc->mount_options->sb_flags;
902 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ 708 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
903 709
904 s->s_fs_info = client; 710 s->s_fs_info = fsc;
905 client->sb = s; 711 fsc->sb = s;
906 712
907 s->s_op = &ceph_super_ops; 713 s->s_op = &ceph_super_ops;
908 s->s_export_op = &ceph_export_ops; 714 s->s_export_op = &ceph_export_ops;
@@ -917,7 +723,7 @@ static int ceph_set_super(struct super_block *s, void *data)
917 723
918fail: 724fail:
919 s->s_fs_info = NULL; 725 s->s_fs_info = NULL;
920 client->sb = NULL; 726 fsc->sb = NULL;
921 return ret; 727 return ret;
922} 728}
923 729
@@ -926,30 +732,23 @@ fail:
926 */ 732 */
927static int ceph_compare_super(struct super_block *sb, void *data) 733static int ceph_compare_super(struct super_block *sb, void *data)
928{ 734{
929 struct ceph_client *new = data; 735 struct ceph_fs_client *new = data;
930 struct ceph_mount_args *args = new->mount_args; 736 struct ceph_mount_options *fsopt = new->mount_options;
931 struct ceph_client *other = ceph_sb_to_client(sb); 737 struct ceph_options *opt = new->client->options;
932 int i; 738 struct ceph_fs_client *other = ceph_sb_to_client(sb);
933 739
934 dout("ceph_compare_super %p\n", sb); 740 dout("ceph_compare_super %p\n", sb);
935 if (args->flags & CEPH_OPT_FSID) { 741
936 if (ceph_fsid_compare(&args->fsid, &other->fsid)) { 742 if (compare_mount_options(fsopt, opt, other)) {
937 dout("fsid doesn't match\n"); 743 dout("monitor(s)/mount options don't match\n");
938 return 0; 744 return 0;
939 }
940 } else {
941 /* do we share (a) monitor? */
942 for (i = 0; i < new->monc.monmap->num_mon; i++)
943 if (ceph_monmap_contains(other->monc.monmap,
944 &new->monc.monmap->mon_inst[i].addr))
945 break;
946 if (i == new->monc.monmap->num_mon) {
947 dout("mon ip not part of monmap\n");
948 return 0;
949 }
950 dout("mon ip matches existing sb %p\n", sb);
951 } 745 }
952 if (args->sb_flags != other->mount_args->sb_flags) { 746 if ((opt->flags & CEPH_OPT_FSID) &&
747 ceph_fsid_compare(&opt->fsid, &other->client->fsid)) {
748 dout("fsid doesn't match\n");
749 return 0;
750 }
751 if (fsopt->sb_flags != other->mount_options->sb_flags) {
953 dout("flags differ\n"); 752 dout("flags differ\n");
954 return 0; 753 return 0;
955 } 754 }
@@ -961,19 +760,20 @@ static int ceph_compare_super(struct super_block *sb, void *data)
961 */ 760 */
962static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); 761static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
963 762
964static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) 763static int ceph_register_bdi(struct super_block *sb,
764 struct ceph_fs_client *fsc)
965{ 765{
966 int err; 766 int err;
967 767
968 /* set ra_pages based on rsize mount option? */ 768 /* set ra_pages based on rsize mount option? */
969 if (client->mount_args->rsize >= PAGE_CACHE_SIZE) 769 if (fsc->mount_options->rsize >= PAGE_CACHE_SIZE)
970 client->backing_dev_info.ra_pages = 770 fsc->backing_dev_info.ra_pages =
971 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) 771 (fsc->mount_options->rsize + PAGE_CACHE_SIZE - 1)
972 >> PAGE_SHIFT; 772 >> PAGE_SHIFT;
973 err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d", 773 err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%d",
974 atomic_long_inc_return(&bdi_seq)); 774 atomic_long_inc_return(&bdi_seq));
975 if (!err) 775 if (!err)
976 sb->s_bdi = &client->backing_dev_info; 776 sb->s_bdi = &fsc->backing_dev_info;
977 return err; 777 return err;
978} 778}
979 779
@@ -982,46 +782,52 @@ static int ceph_get_sb(struct file_system_type *fs_type,
982 struct vfsmount *mnt) 782 struct vfsmount *mnt)
983{ 783{
984 struct super_block *sb; 784 struct super_block *sb;
985 struct ceph_client *client; 785 struct ceph_fs_client *fsc;
986 int err; 786 int err;
987 int (*compare_super)(struct super_block *, void *) = ceph_compare_super; 787 int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
988 const char *path = NULL; 788 const char *path = NULL;
989 struct ceph_mount_args *args; 789 struct ceph_mount_options *fsopt = NULL;
790 struct ceph_options *opt = NULL;
990 791
991 dout("ceph_get_sb\n"); 792 dout("ceph_get_sb\n");
992 args = parse_mount_args(flags, data, dev_name, &path); 793 err = parse_mount_options(&fsopt, &opt, flags, data, dev_name, &path);
993 if (IS_ERR(args)) { 794 if (err < 0)
994 err = PTR_ERR(args);
995 goto out_final; 795 goto out_final;
996 }
997 796
998 /* create client (which we may/may not use) */ 797 /* create client (which we may/may not use) */
999 client = ceph_create_client(args); 798 fsc = create_fs_client(fsopt, opt);
1000 if (IS_ERR(client)) { 799 if (IS_ERR(fsc)) {
1001 err = PTR_ERR(client); 800 err = PTR_ERR(fsc);
801 kfree(fsopt);
802 kfree(opt);
1002 goto out_final; 803 goto out_final;
1003 } 804 }
1004 805
1005 if (client->mount_args->flags & CEPH_OPT_NOSHARE) 806 err = ceph_mdsc_init(fsc);
807 if (err < 0)
808 goto out;
809
810 if (ceph_test_opt(fsc->client, NOSHARE))
1006 compare_super = NULL; 811 compare_super = NULL;
1007 sb = sget(fs_type, compare_super, ceph_set_super, client); 812 sb = sget(fs_type, compare_super, ceph_set_super, fsc);
1008 if (IS_ERR(sb)) { 813 if (IS_ERR(sb)) {
1009 err = PTR_ERR(sb); 814 err = PTR_ERR(sb);
1010 goto out; 815 goto out;
1011 } 816 }
1012 817
1013 if (ceph_sb_to_client(sb) != client) { 818 if (ceph_sb_to_client(sb) != fsc) {
1014 ceph_destroy_client(client); 819 ceph_mdsc_destroy(fsc);
1015 client = ceph_sb_to_client(sb); 820 destroy_fs_client(fsc);
1016 dout("get_sb got existing client %p\n", client); 821 fsc = ceph_sb_to_client(sb);
822 dout("get_sb got existing client %p\n", fsc);
1017 } else { 823 } else {
1018 dout("get_sb using new client %p\n", client); 824 dout("get_sb using new client %p\n", fsc);
1019 err = ceph_register_bdi(sb, client); 825 err = ceph_register_bdi(sb, fsc);
1020 if (err < 0) 826 if (err < 0)
1021 goto out_splat; 827 goto out_splat;
1022 } 828 }
1023 829
1024 err = ceph_mount(client, mnt, path); 830 err = ceph_mount(fsc, mnt, path);
1025 if (err < 0) 831 if (err < 0)
1026 goto out_splat; 832 goto out_splat;
1027 dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root, 833 dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
@@ -1029,12 +835,13 @@ static int ceph_get_sb(struct file_system_type *fs_type,
1029 return 0; 835 return 0;
1030 836
1031out_splat: 837out_splat:
1032 ceph_mdsc_close_sessions(&client->mdsc); 838 ceph_mdsc_close_sessions(fsc->mdsc);
1033 deactivate_locked_super(sb); 839 deactivate_locked_super(sb);
1034 goto out_final; 840 goto out_final;
1035 841
1036out: 842out:
1037 ceph_destroy_client(client); 843 ceph_mdsc_destroy(fsc);
844 destroy_fs_client(fsc);
1038out_final: 845out_final:
1039 dout("ceph_get_sb fail %d\n", err); 846 dout("ceph_get_sb fail %d\n", err);
1040 return err; 847 return err;
@@ -1042,11 +849,12 @@ out_final:
1042 849
1043static void ceph_kill_sb(struct super_block *s) 850static void ceph_kill_sb(struct super_block *s)
1044{ 851{
1045 struct ceph_client *client = ceph_sb_to_client(s); 852 struct ceph_fs_client *fsc = ceph_sb_to_client(s);
1046 dout("kill_sb %p\n", s); 853 dout("kill_sb %p\n", s);
1047 ceph_mdsc_pre_umount(&client->mdsc); 854 ceph_mdsc_pre_umount(fsc->mdsc);
1048 kill_anon_super(s); /* will call put_super after sb is r/o */ 855 kill_anon_super(s); /* will call put_super after sb is r/o */
1049 ceph_destroy_client(client); 856 ceph_mdsc_destroy(fsc);
857 destroy_fs_client(fsc);
1050} 858}
1051 859
1052static struct file_system_type ceph_fs_type = { 860static struct file_system_type ceph_fs_type = {
@@ -1062,36 +870,20 @@ static struct file_system_type ceph_fs_type = {
1062 870
1063static int __init init_ceph(void) 871static int __init init_ceph(void)
1064{ 872{
1065 int ret = 0; 873 int ret = init_caches();
1066
1067 ret = ceph_debugfs_init();
1068 if (ret < 0)
1069 goto out;
1070
1071 ret = ceph_msgr_init();
1072 if (ret < 0)
1073 goto out_debugfs;
1074
1075 ret = init_caches();
1076 if (ret) 874 if (ret)
1077 goto out_msgr; 875 goto out;
1078 876
1079 ret = register_filesystem(&ceph_fs_type); 877 ret = register_filesystem(&ceph_fs_type);
1080 if (ret) 878 if (ret)
1081 goto out_icache; 879 goto out_icache;
1082 880
1083 pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n", 881 pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
1084 CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL, 882
1085 CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
1086 CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
1087 return 0; 883 return 0;
1088 884
1089out_icache: 885out_icache:
1090 destroy_caches(); 886 destroy_caches();
1091out_msgr:
1092 ceph_msgr_exit();
1093out_debugfs:
1094 ceph_debugfs_cleanup();
1095out: 887out:
1096 return ret; 888 return ret;
1097} 889}
@@ -1101,8 +893,6 @@ static void __exit exit_ceph(void)
1101 dout("exit_ceph\n"); 893 dout("exit_ceph\n");
1102 unregister_filesystem(&ceph_fs_type); 894 unregister_filesystem(&ceph_fs_type);
1103 destroy_caches(); 895 destroy_caches();
1104 ceph_msgr_exit();
1105 ceph_debugfs_cleanup();
1106} 896}
1107 897
1108module_init(init_ceph); 898module_init(init_ceph);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index b87638e84c4b..1886294e12f7 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -1,7 +1,7 @@
1#ifndef _FS_CEPH_SUPER_H 1#ifndef _FS_CEPH_SUPER_H
2#define _FS_CEPH_SUPER_H 2#define _FS_CEPH_SUPER_H
3 3
4#include "ceph_debug.h" 4#include <linux/ceph/ceph_debug.h>
5 5
6#include <asm/unaligned.h> 6#include <asm/unaligned.h>
7#include <linux/backing-dev.h> 7#include <linux/backing-dev.h>
@@ -14,13 +14,7 @@
14#include <linux/writeback.h> 14#include <linux/writeback.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16 16
17#include "types.h" 17#include <linux/ceph/libceph.h>
18#include "messenger.h"
19#include "msgpool.h"
20#include "mon_client.h"
21#include "mds_client.h"
22#include "osd_client.h"
23#include "ceph_fs.h"
24 18
25/* f_type in struct statfs */ 19/* f_type in struct statfs */
26#define CEPH_SUPER_MAGIC 0x00c36400 20#define CEPH_SUPER_MAGIC 0x00c36400
@@ -30,42 +24,25 @@
30#define CEPH_BLOCK_SHIFT 20 /* 1 MB */ 24#define CEPH_BLOCK_SHIFT 20 /* 1 MB */
31#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) 25#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
32 26
33/* 27#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
34 * Supported features 28#define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
35 */ 29#define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
36#define CEPH_FEATURE_SUPPORTED CEPH_FEATURE_NOSRCADDR | CEPH_FEATURE_FLOCK
37#define CEPH_FEATURE_REQUIRED CEPH_FEATURE_NOSRCADDR
38 30
39/* 31#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES)
40 * mount options
41 */
42#define CEPH_OPT_FSID (1<<0)
43#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
44#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
45#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */
46#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
47#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */
48#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
49 32
50#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES) 33#define ceph_set_mount_opt(fsc, opt) \
34 (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
35#define ceph_test_mount_opt(fsc, opt) \
36 (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt))
51 37
52#define ceph_set_opt(client, opt) \ 38#define CEPH_MAX_READDIR_DEFAULT 1024
53 (client)->mount_args->flags |= CEPH_OPT_##opt; 39#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
54#define ceph_test_opt(client, opt) \ 40#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
55 (!!((client)->mount_args->flags & CEPH_OPT_##opt))
56 41
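The ceph_set_mount_opt()/ceph_test_mount_opt() helpers above paste the short option name onto the CEPH_MOUNT_OPT_ prefix. A self-contained sketch of the same token-pasting idiom, with a made-up DIRSTAT-style flag and a plain struct rather than the real ceph_fs_client:

#include <stdio.h>

/* sketch only: the real macros operate on fsc->mount_options->flags
 * and the CEPH_MOUNT_OPT_* bits defined above */
#define DEMO_OPT_DIRSTAT (1 << 4)

struct demo_opts {
        int flags;
};

#define demo_set_opt(o, opt)  ((o)->flags |= DEMO_OPT_##opt)
#define demo_test_opt(o, opt) (!!((o)->flags & DEMO_OPT_##opt))

int main(void)
{
        struct demo_opts o = { 0 };

        demo_set_opt(&o, DIRSTAT);
        printf("dirstat: %d\n", demo_test_opt(&o, DIRSTAT));   /* prints "dirstat: 1" */
        return 0;
}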
57 42struct ceph_mount_options {
58struct ceph_mount_args {
59 int sb_flags;
60 int flags; 43 int flags;
61 struct ceph_fsid fsid; 44 int sb_flags;
62 struct ceph_entity_addr my_addr; 45
63 int num_mon;
64 struct ceph_entity_addr *mon_addr;
65 int mount_timeout;
66 int osd_idle_ttl;
67 int osd_timeout;
68 int osd_keepalive_timeout;
69 int wsize; 46 int wsize;
70 int rsize; /* max readahead */ 47 int rsize; /* max readahead */
71 int congestion_kb; /* max writeback in flight */ 48 int congestion_kb; /* max writeback in flight */
@@ -73,82 +50,25 @@ struct ceph_mount_args {
73 int cap_release_safety; 50 int cap_release_safety;
74 int max_readdir; /* max readdir result (entries) */ 51 int max_readdir; /* max readdir result (entries) */
75 int max_readdir_bytes; /* max readdir result (bytes) */ 52 int max_readdir_bytes; /* max readdir result (bytes) */
76 char *snapdir_name; /* default ".snap" */
77 char *name;
78 char *secret;
79};
80 53
81/* 54 /*
82 * defaults 55 * everything above this point can be memcmp'd; everything below
83 */ 56 * is handled in compare_mount_options()
84#define CEPH_MOUNT_TIMEOUT_DEFAULT 60 57 */
85#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */
86#define CEPH_OSD_KEEPALIVE_DEFAULT 5
87#define CEPH_OSD_IDLE_TTL_DEFAULT 60
88#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
89#define CEPH_MAX_READDIR_DEFAULT 1024
90#define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024)
91
92#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
93#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
94
95#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
96#define CEPH_AUTH_NAME_DEFAULT "guest"
97/*
98 * Delay telling the MDS we no longer want caps, in case we reopen
99 * the file. Delay a minimum amount of time, even if we send a cap
100 * message for some other reason. Otherwise, take the opportunity to
101 * update the mds to avoid sending another message later.
102 */
103#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
104#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
105
106#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4)
107
108/* mount state */
109enum {
110 CEPH_MOUNT_MOUNTING,
111 CEPH_MOUNT_MOUNTED,
112 CEPH_MOUNT_UNMOUNTING,
113 CEPH_MOUNT_UNMOUNTED,
114 CEPH_MOUNT_SHUTDOWN,
115};
116
117/*
118 * subtract jiffies
119 */
120static inline unsigned long time_sub(unsigned long a, unsigned long b)
121{
122 BUG_ON(time_after(b, a));
123 return (long)a - (long)b;
124}
125
126/*
127 * per-filesystem client state
128 *
129 * possibly shared by multiple mount points, if they are
130 * mounting the same ceph filesystem/cluster.
131 */
132struct ceph_client {
133 struct ceph_fsid fsid;
134 bool have_fsid;
135 58
136 struct mutex mount_mutex; /* serialize mount attempts */ 59 char *snapdir_name; /* default ".snap" */
137 struct ceph_mount_args *mount_args; 60};
138 61
62struct ceph_fs_client {
139 struct super_block *sb; 63 struct super_block *sb;
140 64
141 unsigned long mount_state; 65 struct ceph_mount_options *mount_options;
142 wait_queue_head_t auth_wq; 66 struct ceph_client *client;
143
144 int auth_err;
145 67
68 unsigned long mount_state;
146 int min_caps; /* min caps i added */ 69 int min_caps; /* min caps i added */
147 70
148 struct ceph_messenger *msgr; /* messenger instance */ 71 struct ceph_mds_client *mdsc;
149 struct ceph_mon_client monc;
150 struct ceph_mds_client mdsc;
151 struct ceph_osd_client osdc;
152 72
153 /* writeback */ 73 /* writeback */
154 mempool_t *wb_pagevec_pool; 74 mempool_t *wb_pagevec_pool;
@@ -160,14 +80,14 @@ struct ceph_client {
160 struct backing_dev_info backing_dev_info; 80 struct backing_dev_info backing_dev_info;
161 81
162#ifdef CONFIG_DEBUG_FS 82#ifdef CONFIG_DEBUG_FS
163 struct dentry *debugfs_monmap; 83 struct dentry *debugfs_dentry_lru, *debugfs_caps;
164 struct dentry *debugfs_mdsmap, *debugfs_osdmap;
165 struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
166 struct dentry *debugfs_congestion_kb; 84 struct dentry *debugfs_congestion_kb;
167 struct dentry *debugfs_bdi; 85 struct dentry *debugfs_bdi;
86 struct dentry *debugfs_mdsc, *debugfs_mdsmap;
168#endif 87#endif
169}; 88};
170 89
90
171/* 91/*
172 * File i/o capability. This tracks shared state with the metadata 92 * File i/o capability. This tracks shared state with the metadata
173 * server that allows us to cache or writeback attributes or to read 93 * server that allows us to cache or writeback attributes or to read
@@ -275,6 +195,20 @@ struct ceph_inode_xattr {
275 int should_free_val; 195 int should_free_val;
276}; 196};
277 197
198/*
199 * Ceph dentry state
200 */
201struct ceph_dentry_info {
202 struct ceph_mds_session *lease_session;
203 u32 lease_gen, lease_shared_gen;
204 u32 lease_seq;
205 unsigned long lease_renew_after, lease_renew_from;
206 struct list_head lru;
207 struct dentry *dentry;
208 u64 time;
209 u64 offset;
210};
211
278struct ceph_inode_xattrs_info { 212struct ceph_inode_xattrs_info {
279 /* 213 /*
280 * (still encoded) xattr blob. we avoid the overhead of parsing 214 * (still encoded) xattr blob. we avoid the overhead of parsing
@@ -296,11 +230,6 @@ struct ceph_inode_xattrs_info {
296/* 230/*
297 * Ceph inode. 231 * Ceph inode.
298 */ 232 */
299#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
300#define CEPH_I_NODELAY 4 /* do not delay cap release */
301#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
302#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
303
304struct ceph_inode_info { 233struct ceph_inode_info {
305 struct ceph_vino i_vino; /* ceph ino + snap */ 234 struct ceph_vino i_vino; /* ceph ino + snap */
306 235
@@ -391,6 +320,63 @@ static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
391 return container_of(inode, struct ceph_inode_info, vfs_inode); 320 return container_of(inode, struct ceph_inode_info, vfs_inode);
392} 321}
393 322
323static inline struct ceph_vino ceph_vino(struct inode *inode)
324{
325 return ceph_inode(inode)->i_vino;
326}
327
328/*
329 * ino_t is <64 bits on many architectures, blech.
330 *
331 * don't include snap in ino hash, at least for now.
332 */
333static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
334{
335 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
336#if BITS_PER_LONG == 32
337 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
338 if (!ino)
339 ino = 1;
340#endif
341 return ino;
342}
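The 32-bit fold above can be exercised in isolation; the inode number below is made up purely for illustration:

#include <assert.h>
#include <stdint.h>

/* standalone check of the fold above; the inode number is hypothetical */
int main(void)
{
        uint64_t ino64 = 0x0000000123456789ULL;
        uint32_t ino32 = (uint32_t)ino64;         /* take the low 32 bits */

        ino32 ^= (uint32_t)(ino64 >> 32);         /* mix in the high 32 bits */
        if (!ino32)
                ino32 = 1;                        /* never hand out inode 0 */
        assert(ino32 == 0x23456788);              /* 0x23456789 ^ 0x1 */
        return 0;
}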
343
344/* for printf-style formatting */
345#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
346
347static inline u64 ceph_ino(struct inode *inode)
348{
349 return ceph_inode(inode)->i_vino.ino;
350}
351static inline u64 ceph_snap(struct inode *inode)
352{
353 return ceph_inode(inode)->i_vino.snap;
354}
355
356static inline int ceph_ino_compare(struct inode *inode, void *data)
357{
358 struct ceph_vino *pvino = (struct ceph_vino *)data;
359 struct ceph_inode_info *ci = ceph_inode(inode);
360 return ci->i_vino.ino == pvino->ino &&
361 ci->i_vino.snap == pvino->snap;
362}
363
364static inline struct inode *ceph_find_inode(struct super_block *sb,
365 struct ceph_vino vino)
366{
367 ino_t t = ceph_vino_to_ino(vino);
368 return ilookup5(sb, t, ceph_ino_compare, &vino);
369}
370
371
372/*
373 * Ceph inode.
374 */
375#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
376#define CEPH_I_NODELAY 4 /* do not delay cap release */
377#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
378#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
379
394static inline void ceph_i_clear(struct inode *inode, unsigned mask) 380static inline void ceph_i_clear(struct inode *inode, unsigned mask)
395{ 381{
396 struct ceph_inode_info *ci = ceph_inode(inode); 382 struct ceph_inode_info *ci = ceph_inode(inode);
@@ -414,8 +400,9 @@ static inline bool ceph_i_test(struct inode *inode, unsigned mask)
414 struct ceph_inode_info *ci = ceph_inode(inode); 400 struct ceph_inode_info *ci = ceph_inode(inode);
415 bool r; 401 bool r;
416 402
417 smp_mb(); 403 spin_lock(&inode->i_lock);
418 r = (ci->i_ceph_flags & mask) == mask; 404 r = (ci->i_ceph_flags & mask) == mask;
405 spin_unlock(&inode->i_lock);
419 return r; 406 return r;
420} 407}
421 408
@@ -432,20 +419,6 @@ extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
432 struct ceph_inode_frag *pfrag, 419 struct ceph_inode_frag *pfrag,
433 int *found); 420 int *found);
434 421
435/*
436 * Ceph dentry state
437 */
438struct ceph_dentry_info {
439 struct ceph_mds_session *lease_session;
440 u32 lease_gen, lease_shared_gen;
441 u32 lease_seq;
442 unsigned long lease_renew_after, lease_renew_from;
443 struct list_head lru;
444 struct dentry *dentry;
445 u64 time;
446 u64 offset;
447};
448
449static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry) 422static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
450{ 423{
451 return (struct ceph_dentry_info *)dentry->d_fsdata; 424 return (struct ceph_dentry_info *)dentry->d_fsdata;
@@ -456,22 +429,6 @@ static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
456 return ((loff_t)frag << 32) | (loff_t)off; 429 return ((loff_t)frag << 32) | (loff_t)off;
457} 430}
458 431
459/*
460 * ino_t is <64 bits on many architectures, blech.
461 *
462 * don't include snap in ino hash, at least for now.
463 */
464static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
465{
466 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
467#if BITS_PER_LONG == 32
468 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
469 if (!ino)
470 ino = 1;
471#endif
472 return ino;
473}
474
475static inline int ceph_set_ino_cb(struct inode *inode, void *data) 432static inline int ceph_set_ino_cb(struct inode *inode, void *data)
476{ 433{
477 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data; 434 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
@@ -479,39 +436,6 @@ static inline int ceph_set_ino_cb(struct inode *inode, void *data)
479 return 0; 436 return 0;
480} 437}
481 438
482static inline struct ceph_vino ceph_vino(struct inode *inode)
483{
484 return ceph_inode(inode)->i_vino;
485}
486
487/* for printf-style formatting */
488#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
489
490static inline u64 ceph_ino(struct inode *inode)
491{
492 return ceph_inode(inode)->i_vino.ino;
493}
494static inline u64 ceph_snap(struct inode *inode)
495{
496 return ceph_inode(inode)->i_vino.snap;
497}
498
499static inline int ceph_ino_compare(struct inode *inode, void *data)
500{
501 struct ceph_vino *pvino = (struct ceph_vino *)data;
502 struct ceph_inode_info *ci = ceph_inode(inode);
503 return ci->i_vino.ino == pvino->ino &&
504 ci->i_vino.snap == pvino->snap;
505}
506
507static inline struct inode *ceph_find_inode(struct super_block *sb,
508 struct ceph_vino vino)
509{
510 ino_t t = ceph_vino_to_ino(vino);
511 return ilookup5(sb, t, ceph_ino_compare, &vino);
512}
513
514
515/* 439/*
516 * caps helpers 440 * caps helpers
517 */ 441 */
@@ -576,18 +500,18 @@ extern int ceph_reserve_caps(struct ceph_mds_client *mdsc,
576 struct ceph_cap_reservation *ctx, int need); 500 struct ceph_cap_reservation *ctx, int need);
577extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc, 501extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc,
578 struct ceph_cap_reservation *ctx); 502 struct ceph_cap_reservation *ctx);
579extern void ceph_reservation_status(struct ceph_client *client, 503extern void ceph_reservation_status(struct ceph_fs_client *client,
580 int *total, int *avail, int *used, 504 int *total, int *avail, int *used,
581 int *reserved, int *min); 505 int *reserved, int *min);
582 506
583static inline struct ceph_client *ceph_inode_to_client(struct inode *inode) 507static inline struct ceph_fs_client *ceph_inode_to_client(struct inode *inode)
584{ 508{
585 return (struct ceph_client *)inode->i_sb->s_fs_info; 509 return (struct ceph_fs_client *)inode->i_sb->s_fs_info;
586} 510}
587 511
588static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb) 512static inline struct ceph_fs_client *ceph_sb_to_client(struct super_block *sb)
589{ 513{
590 return (struct ceph_client *)sb->s_fs_info; 514 return (struct ceph_fs_client *)sb->s_fs_info;
591} 515}
592 516
593 517
@@ -617,51 +541,6 @@ struct ceph_file_info {
617 541
618 542
619/* 543/*
620 * snapshots
621 */
622
623/*
624 * A "snap context" is the set of existing snapshots when we
625 * write data. It is used by the OSD to guide its COW behavior.
626 *
627 * The ceph_snap_context is refcounted, and attached to each dirty
628 * page, indicating which context the dirty data belonged to when it was
629 * dirtied.
630 */
631struct ceph_snap_context {
632 atomic_t nref;
633 u64 seq;
634 int num_snaps;
635 u64 snaps[];
636};
637
638static inline struct ceph_snap_context *
639ceph_get_snap_context(struct ceph_snap_context *sc)
640{
641 /*
642 printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
643 atomic_read(&sc->nref)+1);
644 */
645 if (sc)
646 atomic_inc(&sc->nref);
647 return sc;
648}
649
650static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
651{
652 if (!sc)
653 return;
654 /*
655 printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
656 atomic_read(&sc->nref)-1);
657 */
658 if (atomic_dec_and_test(&sc->nref)) {
659 /*printk(" deleting snap_context %p\n", sc);*/
660 kfree(sc);
661 }
662}
663
664/*
665 * A "snap realm" describes a subset of the file hierarchy sharing 544 * A "snap realm" describes a subset of the file hierarchy sharing
666 * the same set of snapshots that apply to it. The realms themselves 545 * the same set of snapshots that apply to it. The realms themselves
667 * are organized into a hierarchy, such that children inherit (some of) 546 * are organized into a hierarchy, such that children inherit (some of)
@@ -699,16 +578,33 @@ struct ceph_snap_realm {
699 spinlock_t inodes_with_caps_lock; 578 spinlock_t inodes_with_caps_lock;
700}; 579};
701 580
702 581static inline int default_congestion_kb(void)
703
704/*
705 * calculate the number of pages a given length and offset map onto,
706 * if we align the data.
707 */
708static inline int calc_pages_for(u64 off, u64 len)
709{ 582{
710 return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) - 583 int congestion_kb;
711 (off >> PAGE_CACHE_SHIFT); 584
585 /*
586 * Copied from NFS
587 *
588 * congestion size, scale with available memory.
589 *
590 * 64MB: 8192k
591 * 128MB: 11585k
592 * 256MB: 16384k
593 * 512MB: 23170k
594 * 1GB: 32768k
595 * 2GB: 46340k
596 * 4GB: 65536k
597 * 8GB: 92681k
598 * 16GB: 131072k
599 *
600 * This allows larger machines to have larger/more transfers.
601 * Limit the default to 256M
602 */
603 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
604 if (congestion_kb > 256*1024)
605 congestion_kb = 256*1024;
606
607 return congestion_kb;
712} 608}
713 609
714 610
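The table in default_congestion_kb() above falls straight out of the formula; a standalone check of the 1GB row, assuming 4 KiB pages (PAGE_SHIFT == 12):

#include <math.h>
#include <stdio.h>

/* standalone check of the 1GB row above; 4 KiB pages (PAGE_SHIFT == 12) is an assumption */
int main(void)
{
        unsigned long totalram_pages = (1UL << 30) / 4096;   /* 1 GB of RAM */
        unsigned long congestion_kb;

        congestion_kb = (16 * (unsigned long)sqrt((double)totalram_pages)) << (12 - 10);
        if (congestion_kb > 256 * 1024)
                congestion_kb = 256 * 1024;                   /* cap the default at 256 MB */
        printf("%luk\n", congestion_kb);                      /* prints 32768k, matching the table */
        return 0;
}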
@@ -741,16 +637,6 @@ static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
741 ci_item)->writing; 637 ci_item)->writing;
742} 638}
743 639
744
745/* super.c */
746extern struct kmem_cache *ceph_inode_cachep;
747extern struct kmem_cache *ceph_cap_cachep;
748extern struct kmem_cache *ceph_dentry_cachep;
749extern struct kmem_cache *ceph_file_cachep;
750
751extern const char *ceph_msg_type_name(int type);
752extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
753
754/* inode.c */ 640/* inode.c */
755extern const struct inode_operations ceph_file_iops; 641extern const struct inode_operations ceph_file_iops;
756 642
@@ -857,12 +743,18 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
857/* file.c */ 743/* file.c */
858extern const struct file_operations ceph_file_fops; 744extern const struct file_operations ceph_file_fops;
859extern const struct address_space_operations ceph_aops; 745extern const struct address_space_operations ceph_aops;
746extern int ceph_copy_to_page_vector(struct page **pages,
747 const char *data,
748 loff_t off, size_t len);
749extern int ceph_copy_from_page_vector(struct page **pages,
750 char *data,
751 loff_t off, size_t len);
752extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
860extern int ceph_open(struct inode *inode, struct file *file); 753extern int ceph_open(struct inode *inode, struct file *file);
861extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry, 754extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
862 struct nameidata *nd, int mode, 755 struct nameidata *nd, int mode,
863 int locked_dir); 756 int locked_dir);
864extern int ceph_release(struct inode *inode, struct file *filp); 757extern int ceph_release(struct inode *inode, struct file *filp);
865extern void ceph_release_page_vector(struct page **pages, int num_pages);
866 758
867/* dir.c */ 759/* dir.c */
868extern const struct file_operations ceph_dir_fops; 760extern const struct file_operations ceph_dir_fops;
@@ -892,12 +784,6 @@ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
892/* export.c */ 784/* export.c */
893extern const struct export_operations ceph_export_ops; 785extern const struct export_operations ceph_export_ops;
894 786
895/* debugfs.c */
896extern int ceph_debugfs_init(void);
897extern void ceph_debugfs_cleanup(void);
898extern int ceph_debugfs_client_init(struct ceph_client *client);
899extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
900
901/* locks.c */ 787/* locks.c */
902extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); 788extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl);
903extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); 789extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl);
@@ -914,4 +800,8 @@ static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
914 return NULL; 800 return NULL;
915} 801}
916 802
803/* debugfs.c */
804extern int ceph_fs_debugfs_init(struct ceph_fs_client *client);
805extern void ceph_fs_debugfs_cleanup(struct ceph_fs_client *client);
806
917#endif /* _FS_CEPH_SUPER_H */ 807#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 9578af610b73..6e12a6ba5f79 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -1,6 +1,9 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2
2#include "super.h" 3#include "super.h"
3#include "decode.h" 4#include "mds_client.h"
5
6#include <linux/ceph/decode.h>
4 7
5#include <linux/xattr.h> 8#include <linux/xattr.h>
6#include <linux/slab.h> 9#include <linux/slab.h>
@@ -620,12 +623,12 @@ out:
620static int ceph_sync_setxattr(struct dentry *dentry, const char *name, 623static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
621 const char *value, size_t size, int flags) 624 const char *value, size_t size, int flags)
622{ 625{
623 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); 626 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
624 struct inode *inode = dentry->d_inode; 627 struct inode *inode = dentry->d_inode;
625 struct ceph_inode_info *ci = ceph_inode(inode); 628 struct ceph_inode_info *ci = ceph_inode(inode);
626 struct inode *parent_inode = dentry->d_parent->d_inode; 629 struct inode *parent_inode = dentry->d_parent->d_inode;
627 struct ceph_mds_request *req; 630 struct ceph_mds_request *req;
628 struct ceph_mds_client *mdsc = &client->mdsc; 631 struct ceph_mds_client *mdsc = fsc->mdsc;
629 int err; 632 int err;
630 int i, nr_pages; 633 int i, nr_pages;
631 struct page **pages = NULL; 634 struct page **pages = NULL;
@@ -713,10 +716,9 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
713 716
714 /* preallocate memory for xattr name, value, index node */ 717 /* preallocate memory for xattr name, value, index node */
715 err = -ENOMEM; 718 err = -ENOMEM;
716 newname = kmalloc(name_len + 1, GFP_NOFS); 719 newname = kmemdup(name, name_len + 1, GFP_NOFS);
717 if (!newname) 720 if (!newname)
718 goto out; 721 goto out;
719 memcpy(newname, name, name_len + 1);
720 722
721 if (val_len) { 723 if (val_len) {
722 newval = kmalloc(val_len + 1, GFP_NOFS); 724 newval = kmalloc(val_len + 1, GFP_NOFS);
@@ -777,8 +779,8 @@ out:
777 779
778static int ceph_send_removexattr(struct dentry *dentry, const char *name) 780static int ceph_send_removexattr(struct dentry *dentry, const char *name)
779{ 781{
780 struct ceph_client *client = ceph_sb_to_client(dentry->d_sb); 782 struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
781 struct ceph_mds_client *mdsc = &client->mdsc; 783 struct ceph_mds_client *mdsc = fsc->mdsc;
782 struct inode *inode = dentry->d_inode; 784 struct inode *inode = dentry->d_inode;
783 struct inode *parent_inode = dentry->d_parent->d_inode; 785 struct inode *parent_inode = dentry->d_parent->d_inode;
784 struct ceph_mds_request *req; 786 struct ceph_mds_request *req;
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index cc9665522148..c465ae066c62 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,6 @@
1config GFS2_FS 1config GFS2_FS
2 tristate "GFS2 file system support" 2 tristate "GFS2 file system support"
3 depends on EXPERIMENTAL && (64BIT || LBDAF) 3 depends on (64BIT || LBDAF)
4 select DLM if GFS2_FS_LOCKING_DLM 4 select DLM if GFS2_FS_LOCKING_DLM
5 select CONFIGFS_FS if GFS2_FS_LOCKING_DLM 5 select CONFIGFS_FS if GFS2_FS_LOCKING_DLM
6 select SYSFS if GFS2_FS_LOCKING_DLM 6 select SYSFS if GFS2_FS_LOCKING_DLM
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 194fe16d8418..6b24afb96aae 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -36,8 +36,8 @@
36#include "glops.h" 36#include "glops.h"
37 37
38 38
39static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, 39void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
40 unsigned int from, unsigned int to) 40 unsigned int from, unsigned int to)
41{ 41{
42 struct buffer_head *head = page_buffers(page); 42 struct buffer_head *head = page_buffers(page);
43 unsigned int bsize = head->b_size; 43 unsigned int bsize = head->b_size;
@@ -615,7 +615,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
615 unsigned int data_blocks = 0, ind_blocks = 0, rblocks; 615 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
616 int alloc_required; 616 int alloc_required;
617 int error = 0; 617 int error = 0;
618 struct gfs2_alloc *al; 618 struct gfs2_alloc *al = NULL;
619 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 619 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
620 unsigned from = pos & (PAGE_CACHE_SIZE - 1); 620 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
621 unsigned to = from + len; 621 unsigned to = from + len;
@@ -663,6 +663,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
663 rblocks += RES_STATFS + RES_QUOTA; 663 rblocks += RES_STATFS + RES_QUOTA;
664 if (&ip->i_inode == sdp->sd_rindex) 664 if (&ip->i_inode == sdp->sd_rindex)
665 rblocks += 2 * RES_STATFS; 665 rblocks += 2 * RES_STATFS;
666 if (alloc_required)
667 rblocks += gfs2_rg_blocks(al);
666 668
667 error = gfs2_trans_begin(sdp, rblocks, 669 error = gfs2_trans_begin(sdp, rblocks,
668 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); 670 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
@@ -696,13 +698,11 @@ out:
696 698
697 page_cache_release(page); 699 page_cache_release(page);
698 700
699 /* 701 gfs2_trans_end(sdp);
700 * XXX(truncate): the call below should probably be replaced with
701 * a call to the gfs2-specific truncate blocks helper to actually
702 * release disk blocks..
703 */
704 if (pos + len > ip->i_inode.i_size) 702 if (pos + len > ip->i_inode.i_size)
705 truncate_setsize(&ip->i_inode, ip->i_inode.i_size); 703 gfs2_trim_blocks(&ip->i_inode);
704 goto out_trans_fail;
705
706out_endtrans: 706out_endtrans:
707 gfs2_trans_end(sdp); 707 gfs2_trans_end(sdp);
708out_trans_fail: 708out_trans_fail:
@@ -802,10 +802,8 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
802 page_cache_release(page); 802 page_cache_release(page);
803 803
804 if (copied) { 804 if (copied) {
805 if (inode->i_size < to) { 805 if (inode->i_size < to)
806 i_size_write(inode, to); 806 i_size_write(inode, to);
807 ip->i_disksize = inode->i_size;
808 }
809 gfs2_dinode_out(ip, di); 807 gfs2_dinode_out(ip, di);
810 mark_inode_dirty(inode); 808 mark_inode_dirty(inode);
811 } 809 }
@@ -876,8 +874,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
876 874
877 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); 875 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
878 if (ret > 0) { 876 if (ret > 0) {
879 if (inode->i_size > ip->i_disksize)
880 ip->i_disksize = inode->i_size;
881 gfs2_dinode_out(ip, dibh->b_data); 877 gfs2_dinode_out(ip, dibh->b_data);
882 mark_inode_dirty(inode); 878 mark_inode_dirty(inode);
883 } 879 }
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 6f482809d1a3..5476c066d4ee 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -50,7 +50,7 @@ struct strip_mine {
50 * @ip: the inode 50 * @ip: the inode
51 * @dibh: the dinode buffer 51 * @dibh: the dinode buffer
52 * @block: the block number that was allocated 52 * @block: the block number that was allocated
53 * @private: any locked page held by the caller process 53 * @page: The (optional) page. This is looked up if @page is NULL
54 * 54 *
55 * Returns: errno 55 * Returns: errno
56 */ 56 */
@@ -109,8 +109,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
109/** 109/**
110 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big 110 * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
111 * @ip: The GFS2 inode to unstuff 111 * @ip: The GFS2 inode to unstuff
112 * @unstuffer: the routine that handles unstuffing a non-zero length file 112 * @page: The (optional) page. This is looked up if the @page is NULL
113 * @private: private data for the unstuffer
114 * 113 *
115 * This routine unstuffs a dinode and returns it to a "normal" state such 114 * This routine unstuffs a dinode and returns it to a "normal" state such
116 * that the height can be grown in the traditional way. 115 * that the height can be grown in the traditional way.
@@ -132,7 +131,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
132 if (error) 131 if (error)
133 goto out; 132 goto out;
134 133
135 if (ip->i_disksize) { 134 if (i_size_read(&ip->i_inode)) {
136 /* Get a free block, fill it with the stuffed data, 135 /* Get a free block, fill it with the stuffed data,
137 and write it out to disk */ 136 and write it out to disk */
138 137
@@ -161,7 +160,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
161 di = (struct gfs2_dinode *)dibh->b_data; 160 di = (struct gfs2_dinode *)dibh->b_data;
162 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 161 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
163 162
164 if (ip->i_disksize) { 163 if (i_size_read(&ip->i_inode)) {
165 *(__be64 *)(di + 1) = cpu_to_be64(block); 164 *(__be64 *)(di + 1) = cpu_to_be64(block);
166 gfs2_add_inode_blocks(&ip->i_inode, 1); 165 gfs2_add_inode_blocks(&ip->i_inode, 1);
167 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); 166 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
@@ -885,83 +884,14 @@ out:
885} 884}
886 885
887/** 886/**
888 * do_grow - Make a file look bigger than it is
889 * @ip: the inode
890 * @size: the size to set the file to
891 *
892 * Called with an exclusive lock on @ip.
893 *
894 * Returns: errno
895 */
896
897static int do_grow(struct gfs2_inode *ip, u64 size)
898{
899 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
900 struct gfs2_alloc *al;
901 struct buffer_head *dibh;
902 int error;
903
904 al = gfs2_alloc_get(ip);
905 if (!al)
906 return -ENOMEM;
907
908 error = gfs2_quota_lock_check(ip);
909 if (error)
910 goto out;
911
912 al->al_requested = sdp->sd_max_height + RES_DATA;
913
914 error = gfs2_inplace_reserve(ip);
915 if (error)
916 goto out_gunlock_q;
917
918 error = gfs2_trans_begin(sdp,
919 sdp->sd_max_height + al->al_rgd->rd_length +
920 RES_JDATA + RES_DINODE + RES_STATFS + RES_QUOTA, 0);
921 if (error)
922 goto out_ipres;
923
924 error = gfs2_meta_inode_buffer(ip, &dibh);
925 if (error)
926 goto out_end_trans;
927
928 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
929 if (gfs2_is_stuffed(ip)) {
930 error = gfs2_unstuff_dinode(ip, NULL);
931 if (error)
932 goto out_brelse;
933 }
934 }
935
936 ip->i_disksize = size;
937 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
938 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
939 gfs2_dinode_out(ip, dibh->b_data);
940
941out_brelse:
942 brelse(dibh);
943out_end_trans:
944 gfs2_trans_end(sdp);
945out_ipres:
946 gfs2_inplace_release(ip);
947out_gunlock_q:
948 gfs2_quota_unlock(ip);
949out:
950 gfs2_alloc_put(ip);
951 return error;
952}
953
954
955/**
956 * gfs2_block_truncate_page - Deal with zeroing out data for truncate 887 * gfs2_block_truncate_page - Deal with zeroing out data for truncate
957 * 888 *
958 * This is partly borrowed from ext3. 889 * This is partly borrowed from ext3.
959 */ 890 */
960static int gfs2_block_truncate_page(struct address_space *mapping) 891static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from)
961{ 892{
962 struct inode *inode = mapping->host; 893 struct inode *inode = mapping->host;
963 struct gfs2_inode *ip = GFS2_I(inode); 894 struct gfs2_inode *ip = GFS2_I(inode);
964 loff_t from = inode->i_size;
965 unsigned long index = from >> PAGE_CACHE_SHIFT; 895 unsigned long index = from >> PAGE_CACHE_SHIFT;
966 unsigned offset = from & (PAGE_CACHE_SIZE-1); 896 unsigned offset = from & (PAGE_CACHE_SIZE-1);
967 unsigned blocksize, iblock, length, pos; 897 unsigned blocksize, iblock, length, pos;
@@ -1023,9 +953,11 @@ unlock:
1023 return err; 953 return err;
1024} 954}
1025 955
1026static int trunc_start(struct gfs2_inode *ip, u64 size) 956static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
1027{ 957{
1028 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 958 struct gfs2_inode *ip = GFS2_I(inode);
959 struct gfs2_sbd *sdp = GFS2_SB(inode);
960 struct address_space *mapping = inode->i_mapping;
1029 struct buffer_head *dibh; 961 struct buffer_head *dibh;
1030 int journaled = gfs2_is_jdata(ip); 962 int journaled = gfs2_is_jdata(ip);
1031 int error; 963 int error;
@@ -1039,31 +971,26 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
1039 if (error) 971 if (error)
1040 goto out; 972 goto out;
1041 973
974 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
975
1042 if (gfs2_is_stuffed(ip)) { 976 if (gfs2_is_stuffed(ip)) {
1043 u64 dsize = size + sizeof(struct gfs2_dinode); 977 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
1044 ip->i_disksize = size;
1045 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1046 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1047 gfs2_dinode_out(ip, dibh->b_data);
1048 if (dsize > dibh->b_size)
1049 dsize = dibh->b_size;
1050 gfs2_buffer_clear_tail(dibh, dsize);
1051 error = 1;
1052 } else { 978 } else {
1053 if (size & (u64)(sdp->sd_sb.sb_bsize - 1)) 979 if (newsize & (u64)(sdp->sd_sb.sb_bsize - 1)) {
1054 error = gfs2_block_truncate_page(ip->i_inode.i_mapping); 980 error = gfs2_block_truncate_page(mapping, newsize);
1055 981 if (error)
1056 if (!error) { 982 goto out_brelse;
1057 ip->i_disksize = size;
1058 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1059 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1060 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1061 gfs2_dinode_out(ip, dibh->b_data);
1062 } 983 }
984 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1063 } 985 }
1064 986
1065 brelse(dibh); 987 i_size_write(inode, newsize);
988 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
989 gfs2_dinode_out(ip, dibh->b_data);
1066 990
991 truncate_pagecache(inode, oldsize, newsize);
992out_brelse:
993 brelse(dibh);
1067out: 994out:
1068 gfs2_trans_end(sdp); 995 gfs2_trans_end(sdp);
1069 return error; 996 return error;
@@ -1123,7 +1050,7 @@ static int trunc_end(struct gfs2_inode *ip)
1123 if (error) 1050 if (error)
1124 goto out; 1051 goto out;
1125 1052
1126 if (!ip->i_disksize) { 1053 if (!i_size_read(&ip->i_inode)) {
1127 ip->i_height = 0; 1054 ip->i_height = 0;
1128 ip->i_goal = ip->i_no_addr; 1055 ip->i_goal = ip->i_no_addr;
1129 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 1056 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
@@ -1143,92 +1070,154 @@ out:
1143 1070
1144/** 1071/**
1145 * do_shrink - make a file smaller 1072 * do_shrink - make a file smaller
1146 * @ip: the inode 1073 * @inode: the inode
1147 * @size: the size to make the file 1074 * @oldsize: the current inode size
1148 * @truncator: function to truncate the last partial block 1075 * @newsize: the size to make the file
1149 * 1076 *
1150 * Called with an exclusive lock on @ip. 1077 * Called with an exclusive lock on @inode. The @newsize must
1078 * be equal to or smaller than the current inode size.
1151 * 1079 *
1152 * Returns: errno 1080 * Returns: errno
1153 */ 1081 */
1154 1082
1155static int do_shrink(struct gfs2_inode *ip, u64 size) 1083static int do_shrink(struct inode *inode, u64 oldsize, u64 newsize)
1156{ 1084{
1085 struct gfs2_inode *ip = GFS2_I(inode);
1157 int error; 1086 int error;
1158 1087
1159 error = trunc_start(ip, size); 1088 error = trunc_start(inode, oldsize, newsize);
1160 if (error < 0) 1089 if (error < 0)
1161 return error; 1090 return error;
1162 if (error > 0) 1091 if (gfs2_is_stuffed(ip))
1163 return 0; 1092 return 0;
1164 1093
1165 error = trunc_dealloc(ip, size); 1094 error = trunc_dealloc(ip, newsize);
1166 if (!error) 1095 if (error == 0)
1167 error = trunc_end(ip); 1096 error = trunc_end(ip);
1168 1097
1169 return error; 1098 return error;
1170} 1099}
1171 1100
1172static int do_touch(struct gfs2_inode *ip, u64 size) 1101void gfs2_trim_blocks(struct inode *inode)
1173{ 1102{
1174 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1103 u64 size = inode->i_size;
1104 int ret;
1105
1106 ret = do_shrink(inode, size, size);
1107 WARN_ON(ret != 0);
1108}
1109
1110/**
1111 * do_grow - Touch and update inode size
1112 * @inode: The inode
1113 * @size: The new size
1114 *
1115 * This function updates the timestamps on the inode and
1116 * may also increase the size of the inode. This function
1117 * must not be called with @size any smaller than the current
1118 * inode size.
1119 *
1120 * Although it is not strictly required to unstuff files here,
1121 * earlier versions of GFS2 had a bug in the stuffed file reading
1122 * code which will result in a buffer overrun if the size is larger
1123 * than the max stuffed file size. In order to prevent this from
1124 * occurring, such files are unstuffed, but in other cases we can
1125 * just update the inode size directly.
1126 *
1127 * Returns: 0 on success, or -ve on error
1128 */
1129
1130static int do_grow(struct inode *inode, u64 size)
1131{
1132 struct gfs2_inode *ip = GFS2_I(inode);
1133 struct gfs2_sbd *sdp = GFS2_SB(inode);
1175 struct buffer_head *dibh; 1134 struct buffer_head *dibh;
1135 struct gfs2_alloc *al = NULL;
1176 int error; 1136 int error;
1177 1137
1178 error = gfs2_trans_begin(sdp, RES_DINODE, 0); 1138 if (gfs2_is_stuffed(ip) &&
1139 (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) {
1140 al = gfs2_alloc_get(ip);
1141 if (al == NULL)
1142 return -ENOMEM;
1143
1144 error = gfs2_quota_lock_check(ip);
1145 if (error)
1146 goto do_grow_alloc_put;
1147
1148 al->al_requested = 1;
1149 error = gfs2_inplace_reserve(ip);
1150 if (error)
1151 goto do_grow_qunlock;
1152 }
1153
1154 error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT, 0);
1179 if (error) 1155 if (error)
1180 return error; 1156 goto do_grow_release;
1181 1157
1182 down_write(&ip->i_rw_mutex); 1158 if (al) {
1159 error = gfs2_unstuff_dinode(ip, NULL);
1160 if (error)
1161 goto do_end_trans;
1162 }
1183 1163
1184 error = gfs2_meta_inode_buffer(ip, &dibh); 1164 error = gfs2_meta_inode_buffer(ip, &dibh);
1185 if (error) 1165 if (error)
1186 goto do_touch_out; 1166 goto do_end_trans;
1187 1167
1168 i_size_write(inode, size);
1188 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1169 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1189 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 1170 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1190 gfs2_dinode_out(ip, dibh->b_data); 1171 gfs2_dinode_out(ip, dibh->b_data);
1191 brelse(dibh); 1172 brelse(dibh);
1192 1173
1193do_touch_out: 1174do_end_trans:
1194 up_write(&ip->i_rw_mutex);
1195 gfs2_trans_end(sdp); 1175 gfs2_trans_end(sdp);
1176do_grow_release:
1177 if (al) {
1178 gfs2_inplace_release(ip);
1179do_grow_qunlock:
1180 gfs2_quota_unlock(ip);
1181do_grow_alloc_put:
1182 gfs2_alloc_put(ip);
1183 }
1196 return error; 1184 return error;
1197} 1185}
1198 1186
1199/** 1187/**
1200 * gfs2_truncatei - make a file a given size 1188 * gfs2_setattr_size - make a file a given size
1201 * @ip: the inode 1189 * @inode: the inode
1202 * @size: the size to make the file 1190 * @newsize: the size to make the file
1203 * @truncator: function to truncate the last partial block
1204 * 1191 *
1205 * The file size can grow, shrink, or stay the same size. 1192 * The file size can grow, shrink, or stay the same size. This
1193 * is called holding i_mutex and an exclusive glock on the inode
1194 * in question.
1206 * 1195 *
1207 * Returns: errno 1196 * Returns: errno
1208 */ 1197 */
1209 1198
1210int gfs2_truncatei(struct gfs2_inode *ip, u64 size) 1199int gfs2_setattr_size(struct inode *inode, u64 newsize)
1211{ 1200{
1212 int error; 1201 int ret;
1202 u64 oldsize;
1213 1203
1214 if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode))) 1204 BUG_ON(!S_ISREG(inode->i_mode));
1215 return -EINVAL;
1216 1205
1217 if (size > ip->i_disksize) 1206 ret = inode_newsize_ok(inode, newsize);
1218 error = do_grow(ip, size); 1207 if (ret)
1219 else if (size < ip->i_disksize) 1208 return ret;
1220 error = do_shrink(ip, size);
1221 else
1222 /* update time stamps */
1223 error = do_touch(ip, size);
1224 1209
1225 return error; 1210 oldsize = inode->i_size;
1211 if (newsize >= oldsize)
1212 return do_grow(inode, newsize);
1213
1214 return do_shrink(inode, oldsize, newsize);
1226} 1215}
1227 1216
1228int gfs2_truncatei_resume(struct gfs2_inode *ip) 1217int gfs2_truncatei_resume(struct gfs2_inode *ip)
1229{ 1218{
1230 int error; 1219 int error;
1231 error = trunc_dealloc(ip, ip->i_disksize); 1220 error = trunc_dealloc(ip, i_size_read(&ip->i_inode));
1232 if (!error) 1221 if (!error)
1233 error = trunc_end(ip); 1222 error = trunc_end(ip);
1234 return error; 1223 return error;
@@ -1269,7 +1258,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1269 1258
1270 shift = sdp->sd_sb.sb_bsize_shift; 1259 shift = sdp->sd_sb.sb_bsize_shift;
1271 BUG_ON(gfs2_is_dir(ip)); 1260 BUG_ON(gfs2_is_dir(ip));
1272 end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift; 1261 end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
1273 lblock = offset >> shift; 1262 lblock = offset >> shift;
1274 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; 1263 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1275 if (lblock_stop > end_of_file) 1264 if (lblock_stop > end_of_file)
diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h
index a20a5213135a..42fea03e2bd9 100644
--- a/fs/gfs2/bmap.h
+++ b/fs/gfs2/bmap.h
@@ -44,14 +44,16 @@ static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip,
44 } 44 }
45} 45}
46 46
47int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); 47extern int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page);
48int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create); 48extern int gfs2_block_map(struct inode *inode, sector_t lblock,
49int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); 49 struct buffer_head *bh, int create);
50 50extern int gfs2_extent_map(struct inode *inode, u64 lblock, int *new,
51int gfs2_truncatei(struct gfs2_inode *ip, u64 size); 51 u64 *dblock, unsigned *extlen);
52int gfs2_truncatei_resume(struct gfs2_inode *ip); 52extern int gfs2_setattr_size(struct inode *inode, u64 size);
53int gfs2_file_dealloc(struct gfs2_inode *ip); 53extern void gfs2_trim_blocks(struct inode *inode);
54int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, 54extern int gfs2_truncatei_resume(struct gfs2_inode *ip);
55 unsigned int len); 55extern int gfs2_file_dealloc(struct gfs2_inode *ip);
56extern int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
57 unsigned int len);
56 58
57#endif /* __BMAP_DOT_H__ */ 59#endif /* __BMAP_DOT_H__ */
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index bb7907bde3d8..6798755b3858 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -49,7 +49,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
49 ip = GFS2_I(inode); 49 ip = GFS2_I(inode);
50 } 50 }
51 51
52 if (sdp->sd_args.ar_localcaching) 52 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
53 goto valid; 53 goto valid;
54 54
55 had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL); 55 had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index b9dd88a78dd4..5c356d09c321 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -79,6 +79,9 @@
79#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1) 79#define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
80#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1)) 80#define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
81 81
82struct qstr gfs2_qdot __read_mostly;
83struct qstr gfs2_qdotdot __read_mostly;
84
82typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len, 85typedef int (*leaf_call_t) (struct gfs2_inode *dip, u32 index, u32 len,
83 u64 leaf_no, void *data); 86 u64 leaf_no, void *data);
84typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent, 87typedef int (*gfs2_dscan_t)(const struct gfs2_dirent *dent,
@@ -127,8 +130,8 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf,
127 130
128 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 131 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
129 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); 132 memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size);
130 if (ip->i_disksize < offset + size) 133 if (ip->i_inode.i_size < offset + size)
131 ip->i_disksize = offset + size; 134 i_size_write(&ip->i_inode, offset + size);
132 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 135 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
133 gfs2_dinode_out(ip, dibh->b_data); 136 gfs2_dinode_out(ip, dibh->b_data);
134 137
@@ -225,8 +228,8 @@ out:
225 if (error) 228 if (error)
226 return error; 229 return error;
227 230
228 if (ip->i_disksize < offset + copied) 231 if (ip->i_inode.i_size < offset + copied)
229 ip->i_disksize = offset + copied; 232 i_size_write(&ip->i_inode, offset + copied);
230 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 233 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
231 234
232 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 235 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
@@ -275,12 +278,13 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset,
275 unsigned int o; 278 unsigned int o;
276 int copied = 0; 279 int copied = 0;
277 int error = 0; 280 int error = 0;
281 u64 disksize = i_size_read(&ip->i_inode);
278 282
279 if (offset >= ip->i_disksize) 283 if (offset >= disksize)
280 return 0; 284 return 0;
281 285
282 if (offset + size > ip->i_disksize) 286 if (offset + size > disksize)
283 size = ip->i_disksize - offset; 287 size = disksize - offset;
284 288
285 if (!size) 289 if (!size)
286 return 0; 290 return 0;
@@ -727,7 +731,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
727 unsigned hsize = 1 << ip->i_depth; 731 unsigned hsize = 1 << ip->i_depth;
728 unsigned index; 732 unsigned index;
729 u64 ln; 733 u64 ln;
730 if (hsize * sizeof(u64) != ip->i_disksize) { 734 if (hsize * sizeof(u64) != i_size_read(inode)) {
731 gfs2_consist_inode(ip); 735 gfs2_consist_inode(ip);
732 return ERR_PTR(-EIO); 736 return ERR_PTR(-EIO);
733 } 737 }
@@ -879,7 +883,7 @@ static int dir_make_exhash(struct inode *inode)
879 for (x = sdp->sd_hash_ptrs; x--; lp++) 883 for (x = sdp->sd_hash_ptrs; x--; lp++)
880 *lp = cpu_to_be64(bn); 884 *lp = cpu_to_be64(bn);
881 885
882 dip->i_disksize = sdp->sd_sb.sb_bsize / 2; 886 i_size_write(inode, sdp->sd_sb.sb_bsize / 2);
883 gfs2_add_inode_blocks(&dip->i_inode, 1); 887 gfs2_add_inode_blocks(&dip->i_inode, 1);
884 dip->i_diskflags |= GFS2_DIF_EXHASH; 888 dip->i_diskflags |= GFS2_DIF_EXHASH;
885 889
@@ -1057,11 +1061,12 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1057 u64 *buf; 1061 u64 *buf;
1058 u64 *from, *to; 1062 u64 *from, *to;
1059 u64 block; 1063 u64 block;
1064 u64 disksize = i_size_read(&dip->i_inode);
1060 int x; 1065 int x;
1061 int error = 0; 1066 int error = 0;
1062 1067
1063 hsize = 1 << dip->i_depth; 1068 hsize = 1 << dip->i_depth;
1064 if (hsize * sizeof(u64) != dip->i_disksize) { 1069 if (hsize * sizeof(u64) != disksize) {
1065 gfs2_consist_inode(dip); 1070 gfs2_consist_inode(dip);
1066 return -EIO; 1071 return -EIO;
1067 } 1072 }
@@ -1072,7 +1077,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1072 if (!buf) 1077 if (!buf)
1073 return -ENOMEM; 1078 return -ENOMEM;
1074 1079
1075 for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) { 1080 for (block = disksize >> sdp->sd_hash_bsize_shift; block--;) {
1076 error = gfs2_dir_read_data(dip, (char *)buf, 1081 error = gfs2_dir_read_data(dip, (char *)buf,
1077 block * sdp->sd_hash_bsize, 1082 block * sdp->sd_hash_bsize,
1078 sdp->sd_hash_bsize, 1); 1083 sdp->sd_hash_bsize, 1);
@@ -1370,7 +1375,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
1370 unsigned depth = 0; 1375 unsigned depth = 0;
1371 1376
1372 hsize = 1 << dip->i_depth; 1377 hsize = 1 << dip->i_depth;
1373 if (hsize * sizeof(u64) != dip->i_disksize) { 1378 if (hsize * sizeof(u64) != i_size_read(inode)) {
1374 gfs2_consist_inode(dip); 1379 gfs2_consist_inode(dip);
1375 return -EIO; 1380 return -EIO;
1376 } 1381 }
@@ -1784,7 +1789,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
1784 int error = 0; 1789 int error = 0;
1785 1790
1786 hsize = 1 << dip->i_depth; 1791 hsize = 1 << dip->i_depth;
1787 if (hsize * sizeof(u64) != dip->i_disksize) { 1792 if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) {
1788 gfs2_consist_inode(dip); 1793 gfs2_consist_inode(dip);
1789 return -EIO; 1794 return -EIO;
1790 } 1795 }
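
Aside: the exhash directory code checks in several places that the on-disk hash table size (hsize slots of u64) matches the inode size; after this patch the check reads the size through i_size_read() instead of i_disksize. The pattern, condensed from the hunks above (not a new function, just the recurring check):

    u64 hsize = 1 << dip->i_depth;                  /* number of hash table slots */
    if (hsize * sizeof(u64) != i_size_read(&dip->i_inode)) {
            gfs2_consist_inode(dip);                /* table size and inode size disagree */
            return -EIO;
    }
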
diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h
index 4f919440c3be..a98f644bd3df 100644
--- a/fs/gfs2/dir.h
+++ b/fs/gfs2/dir.h
@@ -17,23 +17,24 @@ struct inode;
17struct gfs2_inode; 17struct gfs2_inode;
18struct gfs2_inum; 18struct gfs2_inum;
19 19
20struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *filename); 20extern struct inode *gfs2_dir_search(struct inode *dir,
21int gfs2_dir_check(struct inode *dir, const struct qstr *filename, 21 const struct qstr *filename);
22 const struct gfs2_inode *ip); 22extern int gfs2_dir_check(struct inode *dir, const struct qstr *filename,
23int gfs2_dir_add(struct inode *inode, const struct qstr *filename, 23 const struct gfs2_inode *ip);
24 const struct gfs2_inode *ip, unsigned int type); 24extern int gfs2_dir_add(struct inode *inode, const struct qstr *filename,
25int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename); 25 const struct gfs2_inode *ip, unsigned int type);
26int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, 26extern int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *filename);
27 filldir_t filldir); 27extern int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
28int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, 28 filldir_t filldir);
29 const struct gfs2_inode *nip, unsigned int new_type); 29extern int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename,
30 const struct gfs2_inode *nip, unsigned int new_type);
30 31
31int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip); 32extern int gfs2_dir_exhash_dealloc(struct gfs2_inode *dip);
32 33
33int gfs2_diradd_alloc_required(struct inode *dir, 34extern int gfs2_diradd_alloc_required(struct inode *dir,
34 const struct qstr *filename); 35 const struct qstr *filename);
35int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, 36extern int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block,
36 struct buffer_head **bhp); 37 struct buffer_head **bhp);
37 38
38static inline u32 gfs2_disk_hash(const char *data, int len) 39static inline u32 gfs2_disk_hash(const char *data, int len)
39{ 40{
@@ -61,4 +62,7 @@ static inline void gfs2_qstr2dirent(const struct qstr *name, u16 reclen, struct
61 memcpy(dent + 1, name->name, name->len); 62 memcpy(dent + 1, name->name, name->len);
62} 63}
63 64
65extern struct qstr gfs2_qdot;
66extern struct qstr gfs2_qdotdot;
67
64#endif /* __DIR_DOT_H__ */ 68#endif /* __DIR_DOT_H__ */
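
Aside: gfs2_qdot and gfs2_qdotdot, declared above, replace the struct qstr that was previously rebuilt with gfs2_str2qstr() at every "." / ".." lookup (rmdir, rename, get_parent). They are filled in once at module load; the real call sites appear in the main.c hunk further down. A sketch of that one-time setup, assuming gfs2_str2qstr() populates the name, length and on-disk hash fields:

    struct qstr gfs2_qdot __read_mostly;
    struct qstr gfs2_qdotdot __read_mostly;

    /* Done once in init_gfs2_fs() instead of on every directory operation. */
    static void example_init_shared_qstrs(void)
    {
            gfs2_str2qstr(&gfs2_qdot, ".");
            gfs2_str2qstr(&gfs2_qdotdot, "..");
    }
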
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index dfe237a3f8ad..06d582732d34 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -126,16 +126,9 @@ static int gfs2_get_name(struct dentry *parent, char *name,
126 126
127static struct dentry *gfs2_get_parent(struct dentry *child) 127static struct dentry *gfs2_get_parent(struct dentry *child)
128{ 128{
129 struct qstr dotdot;
130 struct dentry *dentry; 129 struct dentry *dentry;
131 130
132 /* 131 dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1));
133 * XXX(hch): it would be a good idea to keep this around as a
134 * static variable.
135 */
136 gfs2_str2qstr(&dotdot, "..");
137
138 dentry = d_obtain_alias(gfs2_lookupi(child->d_inode, &dotdot, 1));
139 if (!IS_ERR(dentry)) 132 if (!IS_ERR(dentry))
140 dentry->d_op = &gfs2_dops; 133 dentry->d_op = &gfs2_dops;
141 return dentry; 134 return dentry;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 4edd662c8232..237ee6a940df 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -382,8 +382,10 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
382 rblocks = RES_DINODE + ind_blocks; 382 rblocks = RES_DINODE + ind_blocks;
383 if (gfs2_is_jdata(ip)) 383 if (gfs2_is_jdata(ip))
384 rblocks += data_blocks ? data_blocks : 1; 384 rblocks += data_blocks ? data_blocks : 1;
385 if (ind_blocks || data_blocks) 385 if (ind_blocks || data_blocks) {
386 rblocks += RES_STATFS + RES_QUOTA; 386 rblocks += RES_STATFS + RES_QUOTA;
387 rblocks += gfs2_rg_blocks(al);
388 }
387 ret = gfs2_trans_begin(sdp, rblocks, 0); 389 ret = gfs2_trans_begin(sdp, rblocks, 0);
388 if (ret) 390 if (ret)
389 goto out_trans_fail; 391 goto out_trans_fail;
@@ -491,7 +493,7 @@ static int gfs2_open(struct inode *inode, struct file *file)
491 goto fail; 493 goto fail;
492 494
493 if (!(file->f_flags & O_LARGEFILE) && 495 if (!(file->f_flags & O_LARGEFILE) &&
494 ip->i_disksize > MAX_NON_LFS) { 496 i_size_read(inode) > MAX_NON_LFS) {
495 error = -EOVERFLOW; 497 error = -EOVERFLOW;
496 goto fail_gunlock; 498 goto fail_gunlock;
497 } 499 }
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 9adf8f924e08..87778857f099 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -441,6 +441,8 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
441 else 441 else
442 gfs2_glock_put_nolock(gl); 442 gfs2_glock_put_nolock(gl);
443 } 443 }
444 if (held1 && held2 && list_empty(&gl->gl_holders))
445 clear_bit(GLF_QUEUED, &gl->gl_flags);
444 446
445 gl->gl_state = new_state; 447 gl->gl_state = new_state;
446 gl->gl_tchange = jiffies; 448 gl->gl_tchange = jiffies;
@@ -1012,6 +1014,7 @@ fail:
1012 if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt)) 1014 if (unlikely((gh->gh_flags & LM_FLAG_PRIORITY) && !insert_pt))
1013 insert_pt = &gh2->gh_list; 1015 insert_pt = &gh2->gh_list;
1014 } 1016 }
1017 set_bit(GLF_QUEUED, &gl->gl_flags);
1015 if (likely(insert_pt == NULL)) { 1018 if (likely(insert_pt == NULL)) {
1016 list_add_tail(&gh->gh_list, &gl->gl_holders); 1019 list_add_tail(&gh->gh_list, &gl->gl_holders);
1017 if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY)) 1020 if (unlikely(gh->gh_flags & LM_FLAG_PRIORITY))
@@ -1310,10 +1313,12 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
1310 1313
1311 gfs2_glock_hold(gl); 1314 gfs2_glock_hold(gl);
1312 holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; 1315 holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time;
1313 if (time_before(now, holdtime)) 1316 if (test_bit(GLF_QUEUED, &gl->gl_flags)) {
1314 delay = holdtime - now; 1317 if (time_before(now, holdtime))
1315 if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) 1318 delay = holdtime - now;
1316 delay = gl->gl_ops->go_min_hold_time; 1319 if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
1320 delay = gl->gl_ops->go_min_hold_time;
1321 }
1317 1322
1318 spin_lock(&gl->gl_spin); 1323 spin_lock(&gl->gl_spin);
1319 handle_callback(gl, state, delay); 1324 handle_callback(gl, state, delay);
@@ -1512,7 +1517,7 @@ static void clear_glock(struct gfs2_glock *gl)
1512 spin_unlock(&lru_lock); 1517 spin_unlock(&lru_lock);
1513 1518
1514 spin_lock(&gl->gl_spin); 1519 spin_lock(&gl->gl_spin);
1515 if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED) 1520 if (gl->gl_state != LM_ST_UNLOCKED)
1516 handle_callback(gl, LM_ST_UNLOCKED, 0); 1521 handle_callback(gl, LM_ST_UNLOCKED, 0);
1517 spin_unlock(&gl->gl_spin); 1522 spin_unlock(&gl->gl_spin);
1518 gfs2_glock_hold(gl); 1523 gfs2_glock_hold(gl);
@@ -1660,6 +1665,8 @@ static const char *gflags2str(char *buf, const unsigned long *gflags)
1660 *p++ = 'I'; 1665 *p++ = 'I';
1661 if (test_bit(GLF_FROZEN, gflags)) 1666 if (test_bit(GLF_FROZEN, gflags))
1662 *p++ = 'F'; 1667 *p++ = 'F';
1668 if (test_bit(GLF_QUEUED, gflags))
1669 *p++ = 'q';
1663 *p = 0; 1670 *p = 0;
1664 return buf; 1671 return buf;
1665} 1672}
@@ -1776,10 +1783,12 @@ int __init gfs2_glock_init(void)
1776 } 1783 }
1777#endif 1784#endif
1778 1785
1779 glock_workqueue = create_workqueue("glock_workqueue"); 1786 glock_workqueue = alloc_workqueue("glock_workqueue", WQ_RESCUER |
1787 WQ_HIGHPRI | WQ_FREEZEABLE, 0);
1780 if (IS_ERR(glock_workqueue)) 1788 if (IS_ERR(glock_workqueue))
1781 return PTR_ERR(glock_workqueue); 1789 return PTR_ERR(glock_workqueue);
1782 gfs2_delete_workqueue = create_workqueue("delete_workqueue"); 1790 gfs2_delete_workqueue = alloc_workqueue("delete_workqueue", WQ_RESCUER |
1791 WQ_FREEZEABLE, 0);
1783 if (IS_ERR(gfs2_delete_workqueue)) { 1792 if (IS_ERR(gfs2_delete_workqueue)) {
1784 destroy_workqueue(glock_workqueue); 1793 destroy_workqueue(glock_workqueue);
1785 return PTR_ERR(gfs2_delete_workqueue); 1794 return PTR_ERR(gfs2_delete_workqueue);
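
Aside: the glock.c hunks above cooperate around the new GLF_QUEUED flag: it is set whenever a holder is queued, cleared on a state change once the lock stays held but the holder list is empty, and consulted by the remote callback so only glocks that actually saw holders keep the minimum-hold delay. A condensed restatement of those hunks (fragments from the patch, not a self-contained function):

    /* add_to_queue(): remember that this glock has had holders queued. */
    set_bit(GLF_QUEUED, &gl->gl_flags);

    /* state_change(): clear the hint once no holders remain queued. */
    if (held1 && held2 && list_empty(&gl->gl_holders))
            clear_bit(GLF_QUEUED, &gl->gl_flags);

    /* gfs2_glock_cb(): only delay the demote for glocks that were in use. */
    if (test_bit(GLF_QUEUED, &gl->gl_flags)) {
            if (time_before(now, holdtime))
                    delay = holdtime - now;
            if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags))
                    delay = gl->gl_ops->go_min_hold_time;
    }
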
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 2bda1911b156..db1c26d6d220 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -215,7 +215,7 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
215void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); 215void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
216 216
217/** 217/**
218 * gfs2_glock_nq_init - intialize a holder and enqueue it on a glock 218 * gfs2_glock_nq_init - initialize a holder and enqueue it on a glock
219 * @gl: the glock 219 * @gl: the glock
220 * @state: the state we're requesting 220 * @state: the state we're requesting
221 * @flags: the modifier flags 221 * @flags: the modifier flags
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 49f97d3bb690..0d149dcc04e5 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -262,13 +262,12 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
262 const struct gfs2_inode *ip = gl->gl_object; 262 const struct gfs2_inode *ip = gl->gl_object;
263 if (ip == NULL) 263 if (ip == NULL)
264 return 0; 264 return 0;
265 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu/%llu\n", 265 gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu\n",
266 (unsigned long long)ip->i_no_formal_ino, 266 (unsigned long long)ip->i_no_formal_ino,
267 (unsigned long long)ip->i_no_addr, 267 (unsigned long long)ip->i_no_addr,
268 IF2DT(ip->i_inode.i_mode), ip->i_flags, 268 IF2DT(ip->i_inode.i_mode), ip->i_flags,
269 (unsigned int)ip->i_diskflags, 269 (unsigned int)ip->i_diskflags,
270 (unsigned long long)ip->i_inode.i_size, 270 (unsigned long long)i_size_read(&ip->i_inode));
271 (unsigned long long)ip->i_disksize);
272 return 0; 271 return 0;
273} 272}
274 273
@@ -453,7 +452,6 @@ const struct gfs2_glock_operations *gfs2_glops_list[] = {
453 [LM_TYPE_META] = &gfs2_meta_glops, 452 [LM_TYPE_META] = &gfs2_meta_glops,
454 [LM_TYPE_INODE] = &gfs2_inode_glops, 453 [LM_TYPE_INODE] = &gfs2_inode_glops,
455 [LM_TYPE_RGRP] = &gfs2_rgrp_glops, 454 [LM_TYPE_RGRP] = &gfs2_rgrp_glops,
456 [LM_TYPE_NONDISK] = &gfs2_trans_glops,
457 [LM_TYPE_IOPEN] = &gfs2_iopen_glops, 455 [LM_TYPE_IOPEN] = &gfs2_iopen_glops,
458 [LM_TYPE_FLOCK] = &gfs2_flock_glops, 456 [LM_TYPE_FLOCK] = &gfs2_flock_glops,
459 [LM_TYPE_NONDISK] = &gfs2_nondisk_glops, 457 [LM_TYPE_NONDISK] = &gfs2_nondisk_glops,
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index fdbf4b366fa5..764fbb49efc8 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -196,6 +196,7 @@ enum {
196 GLF_REPLY_PENDING = 9, 196 GLF_REPLY_PENDING = 9,
197 GLF_INITIAL = 10, 197 GLF_INITIAL = 10,
198 GLF_FROZEN = 11, 198 GLF_FROZEN = 11,
199 GLF_QUEUED = 12,
199}; 200};
200 201
201struct gfs2_glock { 202struct gfs2_glock {
@@ -267,7 +268,6 @@ struct gfs2_inode {
267 u64 i_no_formal_ino; 268 u64 i_no_formal_ino;
268 u64 i_generation; 269 u64 i_generation;
269 u64 i_eattr; 270 u64 i_eattr;
270 loff_t i_disksize;
271 unsigned long i_flags; /* GIF_... */ 271 unsigned long i_flags; /* GIF_... */
272 struct gfs2_glock *i_gl; /* Move into i_gh? */ 272 struct gfs2_glock *i_gl; /* Move into i_gh? */
273 struct gfs2_holder i_iopen_gh; 273 struct gfs2_holder i_iopen_gh;
@@ -416,11 +416,8 @@ struct gfs2_args {
416 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ 416 char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */
417 char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */ 417 char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */
418 unsigned int ar_spectator:1; /* Don't get a journal */ 418 unsigned int ar_spectator:1; /* Don't get a journal */
419 unsigned int ar_ignore_local_fs:1; /* Ignore optimisations */
420 unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */ 419 unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */
421 unsigned int ar_localcaching:1; /* Local caching */
422 unsigned int ar_debug:1; /* Oops on errors */ 420 unsigned int ar_debug:1; /* Oops on errors */
423 unsigned int ar_upgrade:1; /* Upgrade ondisk format */
424 unsigned int ar_posix_acl:1; /* Enable posix acls */ 421 unsigned int ar_posix_acl:1; /* Enable posix acls */
425 unsigned int ar_quota:2; /* off/account/on */ 422 unsigned int ar_quota:2; /* off/account/on */
426 unsigned int ar_suiddir:1; /* suiddir support */ 423 unsigned int ar_suiddir:1; /* suiddir support */
@@ -497,7 +494,7 @@ struct gfs2_sb_host {
497 */ 494 */
498 495
499struct lm_lockstruct { 496struct lm_lockstruct {
500 unsigned int ls_jid; 497 int ls_jid;
501 unsigned int ls_first; 498 unsigned int ls_first;
502 unsigned int ls_first_done; 499 unsigned int ls_first_done;
503 unsigned int ls_nodir; 500 unsigned int ls_nodir;
@@ -572,6 +569,7 @@ struct gfs2_sbd {
572 struct list_head sd_rindex_mru_list; 569 struct list_head sd_rindex_mru_list;
573 struct gfs2_rgrpd *sd_rindex_forward; 570 struct gfs2_rgrpd *sd_rindex_forward;
574 unsigned int sd_rgrps; 571 unsigned int sd_rgrps;
572 unsigned int sd_max_rg_data;
575 573
576 /* Journal index stuff */ 574 /* Journal index stuff */
577 575
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 08140f185a37..06370f8bd8cf 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -359,8 +359,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
359 * to do that. 359 * to do that.
360 */ 360 */
361 ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink); 361 ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
362 ip->i_disksize = be64_to_cpu(str->di_size); 362 i_size_write(&ip->i_inode, be64_to_cpu(str->di_size));
363 i_size_write(&ip->i_inode, ip->i_disksize);
364 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); 363 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
365 atime.tv_sec = be64_to_cpu(str->di_atime); 364 atime.tv_sec = be64_to_cpu(str->di_atime);
366 atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); 365 atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
@@ -1055,7 +1054,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
1055 str->di_uid = cpu_to_be32(ip->i_inode.i_uid); 1054 str->di_uid = cpu_to_be32(ip->i_inode.i_uid);
1056 str->di_gid = cpu_to_be32(ip->i_inode.i_gid); 1055 str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
1057 str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); 1056 str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
1058 str->di_size = cpu_to_be64(ip->i_disksize); 1057 str->di_size = cpu_to_be64(i_size_read(&ip->i_inode));
1059 str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); 1058 str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
1060 str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); 1059 str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
1061 str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); 1060 str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
@@ -1085,8 +1084,8 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
1085 (unsigned long long)ip->i_no_formal_ino); 1084 (unsigned long long)ip->i_no_formal_ino);
1086 printk(KERN_INFO " no_addr = %llu\n", 1085 printk(KERN_INFO " no_addr = %llu\n",
1087 (unsigned long long)ip->i_no_addr); 1086 (unsigned long long)ip->i_no_addr);
1088 printk(KERN_INFO " i_disksize = %llu\n", 1087 printk(KERN_INFO " i_size = %llu\n",
1089 (unsigned long long)ip->i_disksize); 1088 (unsigned long long)i_size_read(&ip->i_inode));
1090 printk(KERN_INFO " blocks = %llu\n", 1089 printk(KERN_INFO " blocks = %llu\n",
1091 (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode)); 1090 (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode));
1092 printk(KERN_INFO " i_goal = %llu\n", 1091 printk(KERN_INFO " i_goal = %llu\n",
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 300ada3f21de..6720d7d5fbc6 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -19,6 +19,8 @@ extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
19extern int gfs2_internal_read(struct gfs2_inode *ip, 19extern int gfs2_internal_read(struct gfs2_inode *ip,
20 struct file_ra_state *ra_state, 20 struct file_ra_state *ra_state,
21 char *buf, loff_t *pos, unsigned size); 21 char *buf, loff_t *pos, unsigned size);
22extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
23 unsigned int from, unsigned int to);
22extern void gfs2_set_aops(struct inode *inode); 24extern void gfs2_set_aops(struct inode *inode);
23 25
24static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) 26static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
@@ -80,6 +82,19 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
80 dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr); 82 dent->de_inum.no_addr = cpu_to_be64(ip->i_no_addr);
81} 83}
82 84
85static inline int gfs2_check_internal_file_size(struct inode *inode,
86 u64 minsize, u64 maxsize)
87{
88 u64 size = i_size_read(inode);
89 if (size < minsize || size > maxsize)
90 goto err;
91 if (size & ((1 << inode->i_blkbits) - 1))
92 goto err;
93 return 0;
94err:
95 gfs2_consist_inode(GFS2_I(inode));
96 return -EIO;
97}
83 98
84extern void gfs2_set_iop(struct inode *inode); 99extern void gfs2_set_iop(struct inode *inode);
85extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 100extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type,
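
Aside: gfs2_check_internal_file_size(), added above, centralises the bounds-and-alignment check that the old code open-coded against i_disksize for internal files. Usage sketch, mirroring the quota.c hunk later in this patch where the quota-change inode must be between 1 byte and 64 MB and block aligned:

    if (gfs2_check_internal_file_size(sdp->sd_qc_inode, 1, 64 << 20))
            return -EIO;    /* inode already flagged inconsistent by the helper */
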
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 0e0470ed34c2..1c09425b45fd 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -42,9 +42,9 @@ static void gdlm_ast(void *arg)
42 ret |= LM_OUT_CANCELED; 42 ret |= LM_OUT_CANCELED;
43 goto out; 43 goto out;
44 case -EAGAIN: /* Try lock fails */ 44 case -EAGAIN: /* Try lock fails */
45 case -EDEADLK: /* Deadlock detected */
45 goto out; 46 goto out;
46 case -EINVAL: /* Invalid */ 47 case -ETIMEDOUT: /* Canceled due to timeout */
47 case -ENOMEM: /* Out of memory */
48 ret |= LM_OUT_ERROR; 48 ret |= LM_OUT_ERROR;
49 goto out; 49 goto out;
50 case 0: /* Success */ 50 case 0: /* Success */
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index b1e9630eb46a..d7eb1e209aa8 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -24,6 +24,7 @@
24#include "glock.h" 24#include "glock.h"
25#include "quota.h" 25#include "quota.h"
26#include "recovery.h" 26#include "recovery.h"
27#include "dir.h"
27 28
28static struct shrinker qd_shrinker = { 29static struct shrinker qd_shrinker = {
29 .shrink = gfs2_shrink_qd_memory, 30 .shrink = gfs2_shrink_qd_memory,
@@ -78,6 +79,9 @@ static int __init init_gfs2_fs(void)
78{ 79{
79 int error; 80 int error;
80 81
82 gfs2_str2qstr(&gfs2_qdot, ".");
83 gfs2_str2qstr(&gfs2_qdotdot, "..");
84
81 error = gfs2_sys_init(); 85 error = gfs2_sys_init();
82 if (error) 86 if (error)
83 return error; 87 return error;
@@ -140,7 +144,7 @@ static int __init init_gfs2_fs(void)
140 144
141 error = -ENOMEM; 145 error = -ENOMEM;
142 gfs_recovery_wq = alloc_workqueue("gfs_recovery", 146 gfs_recovery_wq = alloc_workqueue("gfs_recovery",
143 WQ_NON_REENTRANT | WQ_RESCUER, 0); 147 WQ_RESCUER | WQ_FREEZEABLE, 0);
144 if (!gfs_recovery_wq) 148 if (!gfs_recovery_wq)
145 goto fail_wq; 149 goto fail_wq;
146 150
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 4d4b1e8ac64c..aeafc233dc89 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -38,14 +38,6 @@
38#define DO 0 38#define DO 0
39#define UNDO 1 39#define UNDO 1
40 40
41static const u32 gfs2_old_fs_formats[] = {
42 0
43};
44
45static const u32 gfs2_old_multihost_formats[] = {
46 0
47};
48
49/** 41/**
50 * gfs2_tune_init - Fill a gfs2_tune structure with default values 42 * gfs2_tune_init - Fill a gfs2_tune structure with default values
51 * @gt: tune 43 * @gt: tune
@@ -135,8 +127,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
135 127
136static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent) 128static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent)
137{ 129{
138 unsigned int x;
139
140 if (sb->sb_magic != GFS2_MAGIC || 130 if (sb->sb_magic != GFS2_MAGIC ||
141 sb->sb_type != GFS2_METATYPE_SB) { 131 sb->sb_type != GFS2_METATYPE_SB) {
142 if (!silent) 132 if (!silent)
@@ -150,55 +140,9 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int sile
150 sb->sb_multihost_format == GFS2_FORMAT_MULTI) 140 sb->sb_multihost_format == GFS2_FORMAT_MULTI)
151 return 0; 141 return 0;
152 142
153 if (sb->sb_fs_format != GFS2_FORMAT_FS) { 143 fs_warn(sdp, "Unknown on-disk format, unable to mount\n");
154 for (x = 0; gfs2_old_fs_formats[x]; x++)
155 if (gfs2_old_fs_formats[x] == sb->sb_fs_format)
156 break;
157 144
158 if (!gfs2_old_fs_formats[x]) { 145 return -EINVAL;
159 printk(KERN_WARNING
160 "GFS2: code version (%u, %u) is incompatible "
161 "with ondisk format (%u, %u)\n",
162 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
163 sb->sb_fs_format, sb->sb_multihost_format);
164 printk(KERN_WARNING
165 "GFS2: I don't know how to upgrade this FS\n");
166 return -EINVAL;
167 }
168 }
169
170 if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) {
171 for (x = 0; gfs2_old_multihost_formats[x]; x++)
172 if (gfs2_old_multihost_formats[x] ==
173 sb->sb_multihost_format)
174 break;
175
176 if (!gfs2_old_multihost_formats[x]) {
177 printk(KERN_WARNING
178 "GFS2: code version (%u, %u) is incompatible "
179 "with ondisk format (%u, %u)\n",
180 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
181 sb->sb_fs_format, sb->sb_multihost_format);
182 printk(KERN_WARNING
183 "GFS2: I don't know how to upgrade this FS\n");
184 return -EINVAL;
185 }
186 }
187
188 if (!sdp->sd_args.ar_upgrade) {
189 printk(KERN_WARNING
190 "GFS2: code version (%u, %u) is incompatible "
191 "with ondisk format (%u, %u)\n",
192 GFS2_FORMAT_FS, GFS2_FORMAT_MULTI,
193 sb->sb_fs_format, sb->sb_multihost_format);
194 printk(KERN_INFO
195 "GFS2: Use the \"upgrade\" mount option to upgrade "
196 "the FS\n");
197 printk(KERN_INFO "GFS2: See the manual for more details\n");
198 return -EINVAL;
199 }
200
201 return 0;
202} 146}
203 147
204static void end_bio_io_page(struct bio *bio, int error) 148static void end_bio_io_page(struct bio *bio, int error)
@@ -586,7 +530,7 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
586 530
587 prev_db = 0; 531 prev_db = 0;
588 532
589 for (lb = 0; lb < ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; lb++) { 533 for (lb = 0; lb < i_size_read(jd->jd_inode) >> sdp->sd_sb.sb_bsize_shift; lb++) {
590 bh.b_state = 0; 534 bh.b_state = 0;
591 bh.b_blocknr = 0; 535 bh.b_blocknr = 0;
592 bh.b_size = 1 << ip->i_inode.i_blkbits; 536 bh.b_size = 1 << ip->i_inode.i_blkbits;
@@ -1022,7 +966,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
1022 if (!strcmp("lock_nolock", proto)) { 966 if (!strcmp("lock_nolock", proto)) {
1023 lm = &nolock_ops; 967 lm = &nolock_ops;
1024 sdp->sd_args.ar_localflocks = 1; 968 sdp->sd_args.ar_localflocks = 1;
1025 sdp->sd_args.ar_localcaching = 1;
1026#ifdef CONFIG_GFS2_FS_LOCKING_DLM 969#ifdef CONFIG_GFS2_FS_LOCKING_DLM
1027 } else if (!strcmp("lock_dlm", proto)) { 970 } else if (!strcmp("lock_dlm", proto)) {
1028 lm = &gfs2_dlm_ops; 971 lm = &gfs2_dlm_ops;
@@ -1113,8 +1056,6 @@ static int gfs2_journalid_wait(void *word)
1113 1056
1114static int wait_on_journal(struct gfs2_sbd *sdp) 1057static int wait_on_journal(struct gfs2_sbd *sdp)
1115{ 1058{
1116 if (sdp->sd_args.ar_spectator)
1117 return 0;
1118 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) 1059 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
1119 return 0; 1060 return 0;
1120 1061
@@ -1217,6 +1158,20 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
1217 if (error) 1158 if (error)
1218 goto fail_sb; 1159 goto fail_sb;
1219 1160
1161 /*
1162 * If user space has failed to join the cluster or some similar
1163 * failure has occurred, then the journal id will contain a
1164 * negative (error) number. This will then be returned to the
1165 * caller (of the mount syscall). We do this even for spectator
1166 * mounts (which just write a jid of 0 to indicate "ok" even though
1167 * the jid is unused in the spectator case)
1168 */
1169 if (sdp->sd_lockstruct.ls_jid < 0) {
1170 error = sdp->sd_lockstruct.ls_jid;
1171 sdp->sd_lockstruct.ls_jid = 0;
1172 goto fail_sb;
1173 }
1174
1220 error = init_inodes(sdp, DO); 1175 error = init_inodes(sdp, DO);
1221 if (error) 1176 if (error)
1222 goto fail_sb; 1177 goto fail_sb;
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 1009be2c9737..0534510200d5 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -18,6 +18,8 @@
18#include <linux/gfs2_ondisk.h> 18#include <linux/gfs2_ondisk.h>
19#include <linux/crc32.h> 19#include <linux/crc32.h>
20#include <linux/fiemap.h> 20#include <linux/fiemap.h>
21#include <linux/swap.h>
22#include <linux/falloc.h>
21#include <asm/uaccess.h> 23#include <asm/uaccess.h>
22 24
23#include "gfs2.h" 25#include "gfs2.h"
@@ -217,7 +219,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
217 goto out_gunlock_q; 219 goto out_gunlock_q;
218 220
219 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 221 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
220 al->al_rgd->rd_length + 222 gfs2_rg_blocks(al) +
221 2 * RES_DINODE + RES_STATFS + 223 2 * RES_DINODE + RES_STATFS +
222 RES_QUOTA, 0); 224 RES_QUOTA, 0);
223 if (error) 225 if (error)
@@ -406,7 +408,6 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry,
406 408
407 ip = ghs[1].gh_gl->gl_object; 409 ip = ghs[1].gh_gl->gl_object;
408 410
409 ip->i_disksize = size;
410 i_size_write(inode, size); 411 i_size_write(inode, size);
411 412
412 error = gfs2_meta_inode_buffer(ip, &dibh); 413 error = gfs2_meta_inode_buffer(ip, &dibh);
@@ -461,7 +462,7 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
461 ip = ghs[1].gh_gl->gl_object; 462 ip = ghs[1].gh_gl->gl_object;
462 463
463 ip->i_inode.i_nlink = 2; 464 ip->i_inode.i_nlink = 2;
464 ip->i_disksize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); 465 i_size_write(inode, sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode));
465 ip->i_diskflags |= GFS2_DIF_JDATA; 466 ip->i_diskflags |= GFS2_DIF_JDATA;
466 ip->i_entries = 2; 467 ip->i_entries = 2;
467 468
@@ -470,18 +471,15 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
470 if (!gfs2_assert_withdraw(sdp, !error)) { 471 if (!gfs2_assert_withdraw(sdp, !error)) {
471 struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data; 472 struct gfs2_dinode *di = (struct gfs2_dinode *)dibh->b_data;
472 struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1); 473 struct gfs2_dirent *dent = (struct gfs2_dirent *)(di+1);
473 struct qstr str;
474 474
475 gfs2_str2qstr(&str, ".");
476 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 475 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
477 gfs2_qstr2dirent(&str, GFS2_DIRENT_SIZE(str.len), dent); 476 gfs2_qstr2dirent(&gfs2_qdot, GFS2_DIRENT_SIZE(gfs2_qdot.len), dent);
478 dent->de_inum = di->di_num; /* already GFS2 endian */ 477 dent->de_inum = di->di_num; /* already GFS2 endian */
479 dent->de_type = cpu_to_be16(DT_DIR); 478 dent->de_type = cpu_to_be16(DT_DIR);
480 di->di_entries = cpu_to_be32(1); 479 di->di_entries = cpu_to_be32(1);
481 480
482 gfs2_str2qstr(&str, "..");
483 dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1)); 481 dent = (struct gfs2_dirent *)((char*)dent + GFS2_DIRENT_SIZE(1));
484 gfs2_qstr2dirent(&str, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent); 482 gfs2_qstr2dirent(&gfs2_qdotdot, dibh->b_size - GFS2_DIRENT_SIZE(1) - sizeof(struct gfs2_dinode), dent);
485 483
486 gfs2_inum_out(dip, dent); 484 gfs2_inum_out(dip, dent);
487 dent->de_type = cpu_to_be16(DT_DIR); 485 dent->de_type = cpu_to_be16(DT_DIR);
@@ -522,7 +520,6 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
522static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, 520static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
523 struct gfs2_inode *ip) 521 struct gfs2_inode *ip)
524{ 522{
525 struct qstr dotname;
526 int error; 523 int error;
527 524
528 if (ip->i_entries != 2) { 525 if (ip->i_entries != 2) {
@@ -539,13 +536,11 @@ static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
539 if (error) 536 if (error)
540 return error; 537 return error;
541 538
542 gfs2_str2qstr(&dotname, "."); 539 error = gfs2_dir_del(ip, &gfs2_qdot);
543 error = gfs2_dir_del(ip, &dotname);
544 if (error) 540 if (error)
545 return error; 541 return error;
546 542
547 gfs2_str2qstr(&dotname, ".."); 543 error = gfs2_dir_del(ip, &gfs2_qdotdot);
548 error = gfs2_dir_del(ip, &dotname);
549 if (error) 544 if (error)
550 return error; 545 return error;
551 546
@@ -694,11 +689,8 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
694 struct inode *dir = &to->i_inode; 689 struct inode *dir = &to->i_inode;
695 struct super_block *sb = dir->i_sb; 690 struct super_block *sb = dir->i_sb;
696 struct inode *tmp; 691 struct inode *tmp;
697 struct qstr dotdot;
698 int error = 0; 692 int error = 0;
699 693
700 gfs2_str2qstr(&dotdot, "..");
701
702 igrab(dir); 694 igrab(dir);
703 695
704 for (;;) { 696 for (;;) {
@@ -711,7 +703,7 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to)
711 break; 703 break;
712 } 704 }
713 705
714 tmp = gfs2_lookupi(dir, &dotdot, 1); 706 tmp = gfs2_lookupi(dir, &gfs2_qdotdot, 1);
715 if (IS_ERR(tmp)) { 707 if (IS_ERR(tmp)) {
716 error = PTR_ERR(tmp); 708 error = PTR_ERR(tmp);
717 break; 709 break;
@@ -744,7 +736,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
744 struct gfs2_inode *ip = GFS2_I(odentry->d_inode); 736 struct gfs2_inode *ip = GFS2_I(odentry->d_inode);
745 struct gfs2_inode *nip = NULL; 737 struct gfs2_inode *nip = NULL;
746 struct gfs2_sbd *sdp = GFS2_SB(odir); 738 struct gfs2_sbd *sdp = GFS2_SB(odir);
747 struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }; 739 struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }, ri_gh;
748 struct gfs2_rgrpd *nrgd; 740 struct gfs2_rgrpd *nrgd;
749 unsigned int num_gh; 741 unsigned int num_gh;
750 int dir_rename = 0; 742 int dir_rename = 0;
@@ -758,6 +750,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
758 return 0; 750 return 0;
759 } 751 }
760 752
753 error = gfs2_rindex_hold(sdp, &ri_gh);
754 if (error)
755 return error;
761 756
762 if (odip != ndip) { 757 if (odip != ndip) {
763 error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, 758 error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
@@ -887,12 +882,12 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
887 882
888 al->al_requested = sdp->sd_max_dirres; 883 al->al_requested = sdp->sd_max_dirres;
889 884
890 error = gfs2_inplace_reserve(ndip); 885 error = gfs2_inplace_reserve_ri(ndip);
891 if (error) 886 if (error)
892 goto out_gunlock_q; 887 goto out_gunlock_q;
893 888
894 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres + 889 error = gfs2_trans_begin(sdp, sdp->sd_max_dirres +
895 al->al_rgd->rd_length + 890 gfs2_rg_blocks(al) +
896 4 * RES_DINODE + 4 * RES_LEAF + 891 4 * RES_DINODE + 4 * RES_LEAF +
897 RES_STATFS + RES_QUOTA + 4, 0); 892 RES_STATFS + RES_QUOTA + 4, 0);
898 if (error) 893 if (error)
@@ -920,9 +915,6 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
920 } 915 }
921 916
922 if (dir_rename) { 917 if (dir_rename) {
923 struct qstr name;
924 gfs2_str2qstr(&name, "..");
925
926 error = gfs2_change_nlink(ndip, +1); 918 error = gfs2_change_nlink(ndip, +1);
927 if (error) 919 if (error)
928 goto out_end_trans; 920 goto out_end_trans;
@@ -930,7 +922,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
930 if (error) 922 if (error)
931 goto out_end_trans; 923 goto out_end_trans;
932 924
933 error = gfs2_dir_mvino(ip, &name, ndip, DT_DIR); 925 error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR);
934 if (error) 926 if (error)
935 goto out_end_trans; 927 goto out_end_trans;
936 } else { 928 } else {
@@ -972,6 +964,7 @@ out_gunlock_r:
972 if (r_gh.gh_gl) 964 if (r_gh.gh_gl)
973 gfs2_glock_dq_uninit(&r_gh); 965 gfs2_glock_dq_uninit(&r_gh);
974out: 966out:
967 gfs2_glock_dq_uninit(&ri_gh);
975 return error; 968 return error;
976} 969}
977 970
@@ -990,7 +983,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
990 struct gfs2_inode *ip = GFS2_I(dentry->d_inode); 983 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
991 struct gfs2_holder i_gh; 984 struct gfs2_holder i_gh;
992 struct buffer_head *dibh; 985 struct buffer_head *dibh;
993 unsigned int x; 986 unsigned int x, size;
994 char *buf; 987 char *buf;
995 int error; 988 int error;
996 989
@@ -1002,7 +995,8 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
1002 return NULL; 995 return NULL;
1003 } 996 }
1004 997
1005 if (!ip->i_disksize) { 998 size = (unsigned int)i_size_read(&ip->i_inode);
999 if (size == 0) {
1006 gfs2_consist_inode(ip); 1000 gfs2_consist_inode(ip);
1007 buf = ERR_PTR(-EIO); 1001 buf = ERR_PTR(-EIO);
1008 goto out; 1002 goto out;
@@ -1014,7 +1008,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd)
1014 goto out; 1008 goto out;
1015 } 1009 }
1016 1010
1017 x = ip->i_disksize + 1; 1011 x = size + 1;
1018 buf = kmalloc(x, GFP_NOFS); 1012 buf = kmalloc(x, GFP_NOFS);
1019 if (!buf) 1013 if (!buf)
1020 buf = ERR_PTR(-ENOMEM); 1014 buf = ERR_PTR(-ENOMEM);
@@ -1071,30 +1065,6 @@ int gfs2_permission(struct inode *inode, int mask)
1071 return error; 1065 return error;
1072} 1066}
1073 1067
1074/*
1075 * XXX(truncate): the truncate_setsize calls should be moved to the end.
1076 */
1077static int setattr_size(struct inode *inode, struct iattr *attr)
1078{
1079 struct gfs2_inode *ip = GFS2_I(inode);
1080 struct gfs2_sbd *sdp = GFS2_SB(inode);
1081 int error;
1082
1083 if (attr->ia_size != ip->i_disksize) {
1084 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
1085 if (error)
1086 return error;
1087 truncate_setsize(inode, attr->ia_size);
1088 gfs2_trans_end(sdp);
1089 }
1090
1091 error = gfs2_truncatei(ip, attr->ia_size);
1092 if (error && (inode->i_size != ip->i_disksize))
1093 i_size_write(inode, ip->i_disksize);
1094
1095 return error;
1096}
1097
1098static int setattr_chown(struct inode *inode, struct iattr *attr) 1068static int setattr_chown(struct inode *inode, struct iattr *attr)
1099{ 1069{
1100 struct gfs2_inode *ip = GFS2_I(inode); 1070 struct gfs2_inode *ip = GFS2_I(inode);
@@ -1195,7 +1165,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
1195 goto out; 1165 goto out;
1196 1166
1197 if (attr->ia_valid & ATTR_SIZE) 1167 if (attr->ia_valid & ATTR_SIZE)
1198 error = setattr_size(inode, attr); 1168 error = gfs2_setattr_size(inode, attr->ia_size);
1199 else if (attr->ia_valid & (ATTR_UID | ATTR_GID)) 1169 else if (attr->ia_valid & (ATTR_UID | ATTR_GID))
1200 error = setattr_chown(inode, attr); 1170 error = setattr_chown(inode, attr);
1201 else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode)) 1171 else if ((attr->ia_valid & ATTR_MODE) && IS_POSIXACL(inode))
@@ -1301,6 +1271,257 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
1301 return ret; 1271 return ret;
1302} 1272}
1303 1273
1274static void empty_write_end(struct page *page, unsigned from,
1275 unsigned to)
1276{
1277 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
1278
1279 page_zero_new_buffers(page, from, to);
1280 flush_dcache_page(page);
1281 mark_page_accessed(page);
1282
1283 if (!gfs2_is_writeback(ip))
1284 gfs2_page_add_databufs(ip, page, from, to);
1285
1286 block_commit_write(page, from, to);
1287}
1288
1289
1290static int write_empty_blocks(struct page *page, unsigned from, unsigned to)
1291{
1292 unsigned start, end, next;
1293 struct buffer_head *bh, *head;
1294 int error;
1295
1296 if (!page_has_buffers(page)) {
1297 error = block_prepare_write(page, from, to, gfs2_block_map);
1298 if (unlikely(error))
1299 return error;
1300
1301 empty_write_end(page, from, to);
1302 return 0;
1303 }
1304
1305 bh = head = page_buffers(page);
1306 next = end = 0;
1307 while (next < from) {
1308 next += bh->b_size;
1309 bh = bh->b_this_page;
1310 }
1311 start = next;
1312 do {
1313 next += bh->b_size;
1314 if (buffer_mapped(bh)) {
1315 if (end) {
1316 error = block_prepare_write(page, start, end,
1317 gfs2_block_map);
1318 if (unlikely(error))
1319 return error;
1320 empty_write_end(page, start, end);
1321 end = 0;
1322 }
1323 start = next;
1324 }
1325 else
1326 end = next;
1327 bh = bh->b_this_page;
1328 } while (next < to);
1329
1330 if (end) {
1331 error = block_prepare_write(page, start, end, gfs2_block_map);
1332 if (unlikely(error))
1333 return error;
1334 empty_write_end(page, start, end);
1335 }
1336
1337 return 0;
1338}
1339
1340static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
1341 int mode)
1342{
1343 struct gfs2_inode *ip = GFS2_I(inode);
1344 struct buffer_head *dibh;
1345 int error;
1346 u64 start = offset >> PAGE_CACHE_SHIFT;
1347 unsigned int start_offset = offset & ~PAGE_CACHE_MASK;
1348 u64 end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
1349 pgoff_t curr;
1350 struct page *page;
1351 unsigned int end_offset = (offset + len) & ~PAGE_CACHE_MASK;
1352 unsigned int from, to;
1353
1354 if (!end_offset)
1355 end_offset = PAGE_CACHE_SIZE;
1356
1357 error = gfs2_meta_inode_buffer(ip, &dibh);
1358 if (unlikely(error))
1359 goto out;
1360
1361 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
1362
1363 if (gfs2_is_stuffed(ip)) {
1364 error = gfs2_unstuff_dinode(ip, NULL);
1365 if (unlikely(error))
1366 goto out;
1367 }
1368
1369 curr = start;
1370 offset = start << PAGE_CACHE_SHIFT;
1371 from = start_offset;
1372 to = PAGE_CACHE_SIZE;
1373 while (curr <= end) {
1374 page = grab_cache_page_write_begin(inode->i_mapping, curr,
1375 AOP_FLAG_NOFS);
1376 if (unlikely(!page)) {
1377 error = -ENOMEM;
1378 goto out;
1379 }
1380
1381 if (curr == end)
1382 to = end_offset;
1383 error = write_empty_blocks(page, from, to);
1384 if (!error && offset + to > inode->i_size &&
1385 !(mode & FALLOC_FL_KEEP_SIZE)) {
1386 i_size_write(inode, offset + to);
1387 }
1388 unlock_page(page);
1389 page_cache_release(page);
1390 if (error)
1391 goto out;
1392 curr++;
1393 offset += PAGE_CACHE_SIZE;
1394 from = 0;
1395 }
1396
1397 gfs2_dinode_out(ip, dibh->b_data);
1398 mark_inode_dirty(inode);
1399
1400 brelse(dibh);
1401
1402out:
1403 return error;
1404}
1405
1406static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len,
1407 unsigned int *data_blocks, unsigned int *ind_blocks)
1408{
1409 const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1410 unsigned int max_blocks = ip->i_alloc->al_rgd->rd_free_clone;
1411 unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1);
1412
1413 for (tmp = max_data; tmp > sdp->sd_diptrs;) {
1414 tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs);
1415 max_data -= tmp;
1416 }
 1417	/* This calculation isn't the exact reverse of gfs2_write_calc_reserv,
 1418	   so it might end up with fewer data blocks */
1419 if (max_data <= *data_blocks)
1420 return;
1421 *data_blocks = max_data;
1422 *ind_blocks = max_blocks - max_data;
1423 *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift;
1424 if (*len > max) {
1425 *len = max;
1426 gfs2_write_calc_reserv(ip, max, data_blocks, ind_blocks);
1427 }
1428}
1429
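
Aside: calc_max_reserv() above splits the free blocks of the current resource group between data blocks and the indirect blocks needed to map them, walking down the metadata heights. A worked example with a deliberately tiny, hypothetical geometry (4 direct pointers, 8 pointers per indirect block, 100 usable blocks), only to show the loop converging:

    /* Hypothetical values, not real GFS2 geometry: */
    unsigned int sd_diptrs = 4, sd_inptrs = 8;
    unsigned int max_blocks = 100, max_data = max_blocks, tmp;

    for (tmp = max_data; tmp > sd_diptrs;) {
            tmp = DIV_ROUND_UP(tmp, sd_inptrs);  /* indirect blocks at this height */
            max_data -= tmp;                     /* max_data: 100 -> 87 -> 85 */
    }
    /* Result: 85 blocks for data, 100 - 85 = 15 reserved for indirect blocks. */
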
1430static long gfs2_fallocate(struct inode *inode, int mode, loff_t offset,
1431 loff_t len)
1432{
1433 struct gfs2_sbd *sdp = GFS2_SB(inode);
1434 struct gfs2_inode *ip = GFS2_I(inode);
1435 unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
1436 loff_t bytes, max_bytes;
1437 struct gfs2_alloc *al;
1438 int error;
1439 loff_t next = (offset + len - 1) >> sdp->sd_sb.sb_bsize_shift;
1440 next = (next + 1) << sdp->sd_sb.sb_bsize_shift;
1441
1442 offset = (offset >> sdp->sd_sb.sb_bsize_shift) <<
1443 sdp->sd_sb.sb_bsize_shift;
1444
1445 len = next - offset;
1446 bytes = sdp->sd_max_rg_data * sdp->sd_sb.sb_bsize / 2;
1447 if (!bytes)
1448 bytes = UINT_MAX;
1449
1450 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
1451 error = gfs2_glock_nq(&ip->i_gh);
1452 if (unlikely(error))
1453 goto out_uninit;
1454
1455 if (!gfs2_write_alloc_required(ip, offset, len))
1456 goto out_unlock;
1457
1458 while (len > 0) {
1459 if (len < bytes)
1460 bytes = len;
1461 al = gfs2_alloc_get(ip);
1462 if (!al) {
1463 error = -ENOMEM;
1464 goto out_unlock;
1465 }
1466
1467 error = gfs2_quota_lock_check(ip);
1468 if (error)
1469 goto out_alloc_put;
1470
1471retry:
1472 gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
1473
1474 al->al_requested = data_blocks + ind_blocks;
1475 error = gfs2_inplace_reserve(ip);
1476 if (error) {
1477 if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) {
1478 bytes >>= 1;
1479 goto retry;
1480 }
1481 goto out_qunlock;
1482 }
1483 max_bytes = bytes;
1484 calc_max_reserv(ip, len, &max_bytes, &data_blocks, &ind_blocks);
1485 al->al_requested = data_blocks + ind_blocks;
1486
1487 rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA +
1488 RES_RG_HDR + gfs2_rg_blocks(al);
1489 if (gfs2_is_jdata(ip))
1490 rblocks += data_blocks ? data_blocks : 1;
1491
1492 error = gfs2_trans_begin(sdp, rblocks,
1493 PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize);
1494 if (error)
1495 goto out_trans_fail;
1496
1497 error = fallocate_chunk(inode, offset, max_bytes, mode);
1498 gfs2_trans_end(sdp);
1499
1500 if (error)
1501 goto out_trans_fail;
1502
1503 len -= max_bytes;
1504 offset += max_bytes;
1505 gfs2_inplace_release(ip);
1506 gfs2_quota_unlock(ip);
1507 gfs2_alloc_put(ip);
1508 }
1509 goto out_unlock;
1510
1511out_trans_fail:
1512 gfs2_inplace_release(ip);
1513out_qunlock:
1514 gfs2_quota_unlock(ip);
1515out_alloc_put:
1516 gfs2_alloc_put(ip);
1517out_unlock:
1518 gfs2_glock_dq(&ip->i_gh);
1519out_uninit:
1520 gfs2_holder_uninit(&ip->i_gh);
1521 return error;
1522}
1523
1524
1304static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 1525static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
1305 u64 start, u64 len) 1526 u64 start, u64 len)
1306{ 1527{
@@ -1351,6 +1572,7 @@ const struct inode_operations gfs2_file_iops = {
1351 .getxattr = gfs2_getxattr, 1572 .getxattr = gfs2_getxattr,
1352 .listxattr = gfs2_listxattr, 1573 .listxattr = gfs2_listxattr,
1353 .removexattr = gfs2_removexattr, 1574 .removexattr = gfs2_removexattr,
1575 .fallocate = gfs2_fallocate,
1354 .fiemap = gfs2_fiemap, 1576 .fiemap = gfs2_fiemap,
1355}; 1577};
1356 1578
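
Aside: gfs2_fallocate() above first rounds the request out to whole filesystem blocks, then loops, carving off at most half of the largest resource group's data capacity per transaction (sd_max_rg_data), presumably so each chunk's reservation fits within a single resource group. Worked example of the initial rounding with a hypothetical 4096-byte block size (sb_bsize_shift = 12), offset = 5000 and len = 10000:

    loff_t offset = 5000, len = 10000;
    unsigned int bsize_shift = 12;                    /* 4096-byte blocks */

    loff_t next = (offset + len - 1) >> bsize_shift;  /* last block index: 3 */
    next = (next + 1) << bsize_shift;                 /* end rounded up: 16384 */
    offset = (offset >> bsize_shift) << bsize_shift;  /* start rounded down: 4096 */
    len = next - offset;                              /* 12288 bytes = 3 blocks */
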
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 1bc6b5695e6d..58a9b9998b42 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -735,10 +735,8 @@ get_a_page:
735 goto out; 735 goto out;
736 736
737 size = loc + sizeof(struct gfs2_quota); 737 size = loc + sizeof(struct gfs2_quota);
738 if (size > inode->i_size) { 738 if (size > inode->i_size)
739 ip->i_disksize = size;
740 i_size_write(inode, size); 739 i_size_write(inode, size);
741 }
742 inode->i_mtime = inode->i_atime = CURRENT_TIME; 740 inode->i_mtime = inode->i_atime = CURRENT_TIME;
743 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 741 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
744 gfs2_dinode_out(ip, dibh->b_data); 742 gfs2_dinode_out(ip, dibh->b_data);
@@ -817,7 +815,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
817 goto out_alloc; 815 goto out_alloc;
818 816
819 if (nalloc) 817 if (nalloc)
820 blocks += al->al_rgd->rd_length + nalloc * ind_blocks + RES_STATFS; 818 blocks += gfs2_rg_blocks(al) + nalloc * ind_blocks + RES_STATFS;
821 819
822 error = gfs2_trans_begin(sdp, blocks, 0); 820 error = gfs2_trans_begin(sdp, blocks, 0);
823 if (error) 821 if (error)
@@ -1190,18 +1188,17 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void *
1190int gfs2_quota_init(struct gfs2_sbd *sdp) 1188int gfs2_quota_init(struct gfs2_sbd *sdp)
1191{ 1189{
1192 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); 1190 struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode);
1193 unsigned int blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; 1191 u64 size = i_size_read(sdp->sd_qc_inode);
1192 unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift;
1194 unsigned int x, slot = 0; 1193 unsigned int x, slot = 0;
1195 unsigned int found = 0; 1194 unsigned int found = 0;
1196 u64 dblock; 1195 u64 dblock;
1197 u32 extlen = 0; 1196 u32 extlen = 0;
1198 int error; 1197 int error;
1199 1198
1200 if (!ip->i_disksize || ip->i_disksize > (64 << 20) || 1199 if (gfs2_check_internal_file_size(sdp->sd_qc_inode, 1, 64 << 20))
1201 ip->i_disksize & (sdp->sd_sb.sb_bsize - 1)) {
1202 gfs2_consist_inode(ip);
1203 return -EIO; 1200 return -EIO;
1204 } 1201
1205 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; 1202 sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block;
1206 sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE); 1203 sdp->sd_quota_chunks = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * PAGE_SIZE);
1207 1204
@@ -1589,6 +1586,7 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id,
1589 error = gfs2_inplace_reserve(ip); 1586 error = gfs2_inplace_reserve(ip);
1590 if (error) 1587 if (error)
1591 goto out_alloc; 1588 goto out_alloc;
1589 blocks += gfs2_rg_blocks(al);
1592 } 1590 }
1593 1591
1594 error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0); 1592 error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 1, 0);
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index f7f89a94a5a4..f2a02edcac8f 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -455,11 +455,13 @@ void gfs2_recover_func(struct work_struct *work)
455 int ro = 0; 455 int ro = 0;
456 unsigned int pass; 456 unsigned int pass;
457 int error; 457 int error;
458 int jlocked = 0;
458 459
459 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { 460 if (sdp->sd_args.ar_spectator ||
461 (jd->jd_jid != sdp->sd_lockstruct.ls_jid)) {
460 fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n", 462 fs_info(sdp, "jid=%u: Trying to acquire journal lock...\n",
461 jd->jd_jid); 463 jd->jd_jid);
462 464 jlocked = 1;
463 /* Acquire the journal lock so we can do recovery */ 465 /* Acquire the journal lock so we can do recovery */
464 466
465 error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops, 467 error = gfs2_glock_nq_num(sdp, jd->jd_jid, &gfs2_journal_glops,
@@ -554,13 +556,12 @@ void gfs2_recover_func(struct work_struct *work)
554 jd->jd_jid, t); 556 jd->jd_jid, t);
555 } 557 }
556 558
557 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid)
558 gfs2_glock_dq_uninit(&ji_gh);
559
560 gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); 559 gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS);
561 560
562 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) 561 if (jlocked) {
562 gfs2_glock_dq_uninit(&ji_gh);
563 gfs2_glock_dq_uninit(&j_gh); 563 gfs2_glock_dq_uninit(&j_gh);
564 }
564 565
565 fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); 566 fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
566 goto done; 567 goto done;
@@ -568,7 +569,7 @@ void gfs2_recover_func(struct work_struct *work)
568fail_gunlock_tr: 569fail_gunlock_tr:
569 gfs2_glock_dq_uninit(&t_gh); 570 gfs2_glock_dq_uninit(&t_gh);
570fail_gunlock_ji: 571fail_gunlock_ji:
571 if (jd->jd_jid != sdp->sd_lockstruct.ls_jid) { 572 if (jlocked) {
572 gfs2_glock_dq_uninit(&ji_gh); 573 gfs2_glock_dq_uninit(&ji_gh);
573fail_gunlock_j: 574fail_gunlock_j:
574 gfs2_glock_dq_uninit(&j_gh); 575 gfs2_glock_dq_uninit(&j_gh);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 171a744f8e45..fb67f593f408 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -500,7 +500,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp)
500 for (rgrps = 0;; rgrps++) { 500 for (rgrps = 0;; rgrps++) {
501 loff_t pos = rgrps * sizeof(struct gfs2_rindex); 501 loff_t pos = rgrps * sizeof(struct gfs2_rindex);
502 502
503 if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize) 503 if (pos + sizeof(struct gfs2_rindex) >= i_size_read(inode))
504 break; 504 break;
505 error = gfs2_internal_read(ip, &ra_state, buf, &pos, 505 error = gfs2_internal_read(ip, &ra_state, buf, &pos,
506 sizeof(struct gfs2_rindex)); 506 sizeof(struct gfs2_rindex));
@@ -588,7 +588,9 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
588 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 588 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
589 struct inode *inode = &ip->i_inode; 589 struct inode *inode = &ip->i_inode;
590 struct file_ra_state ra_state; 590 struct file_ra_state ra_state;
591 u64 rgrp_count = ip->i_disksize; 591 u64 rgrp_count = i_size_read(inode);
592 struct gfs2_rgrpd *rgd;
593 unsigned int max_data = 0;
592 int error; 594 int error;
593 595
594 do_div(rgrp_count, sizeof(struct gfs2_rindex)); 596 do_div(rgrp_count, sizeof(struct gfs2_rindex));
@@ -603,6 +605,10 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
603 } 605 }
604 } 606 }
605 607
608 list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
609 if (rgd->rd_data > max_data)
610 max_data = rgd->rd_data;
611 sdp->sd_max_rg_data = max_data;
606 sdp->sd_rindex_uptodate = 1; 612 sdp->sd_rindex_uptodate = 1;
607 return 0; 613 return 0;
608} 614}
@@ -622,13 +628,15 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
622 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 628 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
623 struct inode *inode = &ip->i_inode; 629 struct inode *inode = &ip->i_inode;
624 struct file_ra_state ra_state; 630 struct file_ra_state ra_state;
631 struct gfs2_rgrpd *rgd;
632 unsigned int max_data = 0;
625 int error; 633 int error;
626 634
627 file_ra_state_init(&ra_state, inode->i_mapping); 635 file_ra_state_init(&ra_state, inode->i_mapping);
628 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) { 636 for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) {
629 /* Ignore partials */ 637 /* Ignore partials */
630 if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) > 638 if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) >
631 ip->i_disksize) 639 i_size_read(inode))
632 break; 640 break;
633 error = read_rindex_entry(ip, &ra_state); 641 error = read_rindex_entry(ip, &ra_state);
634 if (error) { 642 if (error) {
@@ -636,6 +644,10 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
636 return error; 644 return error;
637 } 645 }
638 } 646 }
647 list_for_each_entry(rgd, &sdp->sd_rindex_list, rd_list)
648 if (rgd->rd_data > max_data)
649 max_data = rgd->rd_data;
650 sdp->sd_max_rg_data = max_data;
639 651
640 sdp->sd_rindex_uptodate = 1; 652 sdp->sd_rindex_uptodate = 1;
641 return 0; 653 return 0;
@@ -1188,7 +1200,8 @@ out:
1188 * Returns: errno 1200 * Returns: errno
1189 */ 1201 */
1190 1202
1191int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line) 1203int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
1204 char *file, unsigned int line)
1192{ 1205{
1193 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1206 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1194 struct gfs2_alloc *al = ip->i_alloc; 1207 struct gfs2_alloc *al = ip->i_alloc;
@@ -1199,12 +1212,15 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
1199 return -EINVAL; 1212 return -EINVAL;
1200 1213
1201try_again: 1214try_again:
1202 /* We need to hold the rindex unless the inode we're using is 1215 if (hold_rindex) {
1203 the rindex itself, in which case it's already held. */ 1216 /* We need to hold the rindex unless the inode we're using is
1204 if (ip != GFS2_I(sdp->sd_rindex)) 1217 the rindex itself, in which case it's already held. */
1205 error = gfs2_rindex_hold(sdp, &al->al_ri_gh); 1218 if (ip != GFS2_I(sdp->sd_rindex))
1206 else if (!sdp->sd_rgrps) /* We may not have the rindex read in, so: */ 1219 error = gfs2_rindex_hold(sdp, &al->al_ri_gh);
1207 error = gfs2_ri_update_special(ip); 1220 else if (!sdp->sd_rgrps) /* We may not have the rindex read
1221 in, so: */
1222 error = gfs2_ri_update_special(ip);
1223 }
1208 1224
1209 if (error) 1225 if (error)
1210 return error; 1226 return error;
@@ -1215,7 +1231,7 @@ try_again:
1215 try to free it, and try the allocation again. */ 1231 try to free it, and try the allocation again. */
1216 error = get_local_rgrp(ip, &unlinked, &last_unlinked); 1232 error = get_local_rgrp(ip, &unlinked, &last_unlinked);
1217 if (error) { 1233 if (error) {
1218 if (ip != GFS2_I(sdp->sd_rindex)) 1234 if (hold_rindex && ip != GFS2_I(sdp->sd_rindex))
1219 gfs2_glock_dq_uninit(&al->al_ri_gh); 1235 gfs2_glock_dq_uninit(&al->al_ri_gh);
1220 if (error != -EAGAIN) 1236 if (error != -EAGAIN)
1221 return error; 1237 return error;
@@ -1257,7 +1273,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
1257 al->al_rgd = NULL; 1273 al->al_rgd = NULL;
1258 if (al->al_rgd_gh.gh_gl) 1274 if (al->al_rgd_gh.gh_gl)
1259 gfs2_glock_dq_uninit(&al->al_rgd_gh); 1275 gfs2_glock_dq_uninit(&al->al_rgd_gh);
1260 if (ip != GFS2_I(sdp->sd_rindex)) 1276 if (ip != GFS2_I(sdp->sd_rindex) && al->al_ri_gh.gh_gl)
1261 gfs2_glock_dq_uninit(&al->al_ri_gh); 1277 gfs2_glock_dq_uninit(&al->al_ri_gh);
1262} 1278}
1263 1279
@@ -1496,11 +1512,19 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
1496 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1512 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1497 struct buffer_head *dibh; 1513 struct buffer_head *dibh;
1498 struct gfs2_alloc *al = ip->i_alloc; 1514 struct gfs2_alloc *al = ip->i_alloc;
1499 struct gfs2_rgrpd *rgd = al->al_rgd; 1515 struct gfs2_rgrpd *rgd;
1500 u32 goal, blk; 1516 u32 goal, blk;
1501 u64 block; 1517 u64 block;
1502 int error; 1518 int error;
1503 1519
1520 /* Only happens if there is a bug in gfs2, return something distinctive
1521 * to ensure that it is noticed.
1522 */
1523 if (al == NULL)
1524 return -ECANCELED;
1525
1526 rgd = al->al_rgd;
1527
1504 if (rgrp_contains_block(rgd, ip->i_goal)) 1528 if (rgrp_contains_block(rgd, ip->i_goal))
1505 goal = ip->i_goal - rgd->rd_data0; 1529 goal = ip->i_goal - rgd->rd_data0;
1506 else 1530 else
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index f07119d89557..0e35c0466f9a 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -39,10 +39,12 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip)
39 ip->i_alloc = NULL; 39 ip->i_alloc = NULL;
40} 40}
41 41
42extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, 42extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, int hold_rindex,
43 unsigned int line); 43 char *file, unsigned int line);
44#define gfs2_inplace_reserve(ip) \ 44#define gfs2_inplace_reserve(ip) \
45gfs2_inplace_reserve_i((ip), __FILE__, __LINE__) 45 gfs2_inplace_reserve_i((ip), 1, __FILE__, __LINE__)
46#define gfs2_inplace_reserve_ri(ip) \
47 gfs2_inplace_reserve_i((ip), 0, __FILE__, __LINE__)
46 48
47extern void gfs2_inplace_release(struct gfs2_inode *ip); 49extern void gfs2_inplace_release(struct gfs2_inode *ip);
48 50
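
The rgrp.h hunk keeps a single worker function and adds a second caller-facing macro that differs only in the new hold_rindex argument, while still recording the call site via __FILE__/__LINE__. The following is a hedged user-space sketch of that wrapper pattern with made-up names (reserve_i, reserve, reserve_ri), not the GFS2 API itself.

#include <stdio.h>

/* Hypothetical analogue of gfs2_inplace_reserve_i(): one worker that
 * takes the extra flag plus the call site recorded by the macros. */
static int reserve_i(int hold_rindex, const char *file, unsigned int line)
{
        printf("reserve (hold_rindex=%d) from %s:%u\n", hold_rindex, file, line);
        return 0;
}

#define reserve(ip)     reserve_i(1, __FILE__, __LINE__)
#define reserve_ri(ip)  reserve_i(0, __FILE__, __LINE__)

int main(void)
{
        reserve(NULL);          /* normal callers also take the rindex lock */
        reserve_ri(NULL);       /* rindex-growing path skips that lock */
        return 0;
}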
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 77cb9f830ee4..047d1176096c 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -85,6 +85,7 @@ static const match_table_t tokens = {
85 {Opt_locktable, "locktable=%s"}, 85 {Opt_locktable, "locktable=%s"},
86 {Opt_hostdata, "hostdata=%s"}, 86 {Opt_hostdata, "hostdata=%s"},
87 {Opt_spectator, "spectator"}, 87 {Opt_spectator, "spectator"},
88 {Opt_spectator, "norecovery"},
88 {Opt_ignore_local_fs, "ignore_local_fs"}, 89 {Opt_ignore_local_fs, "ignore_local_fs"},
89 {Opt_localflocks, "localflocks"}, 90 {Opt_localflocks, "localflocks"},
90 {Opt_localcaching, "localcaching"}, 91 {Opt_localcaching, "localcaching"},
@@ -159,13 +160,13 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
159 args->ar_spectator = 1; 160 args->ar_spectator = 1;
160 break; 161 break;
161 case Opt_ignore_local_fs: 162 case Opt_ignore_local_fs:
162 args->ar_ignore_local_fs = 1; 163 /* Retained for backwards compat only */
163 break; 164 break;
164 case Opt_localflocks: 165 case Opt_localflocks:
165 args->ar_localflocks = 1; 166 args->ar_localflocks = 1;
166 break; 167 break;
167 case Opt_localcaching: 168 case Opt_localcaching:
168 args->ar_localcaching = 1; 169 /* Retained for backwards compat only */
169 break; 170 break;
170 case Opt_debug: 171 case Opt_debug:
171 if (args->ar_errors == GFS2_ERRORS_PANIC) { 172 if (args->ar_errors == GFS2_ERRORS_PANIC) {
@@ -179,7 +180,7 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
179 args->ar_debug = 0; 180 args->ar_debug = 0;
180 break; 181 break;
181 case Opt_upgrade: 182 case Opt_upgrade:
182 args->ar_upgrade = 1; 183 /* Retained for backwards compat only */
183 break; 184 break;
184 case Opt_acl: 185 case Opt_acl:
185 args->ar_posix_acl = 1; 186 args->ar_posix_acl = 1;
@@ -342,15 +343,14 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd)
342{ 343{
343 struct gfs2_inode *ip = GFS2_I(jd->jd_inode); 344 struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
344 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); 345 struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
346 u64 size = i_size_read(jd->jd_inode);
345 347
346 if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) || 348 if (gfs2_check_internal_file_size(jd->jd_inode, 8 << 20, 1 << 30))
347 (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) {
348 gfs2_consist_inode(ip);
349 return -EIO; 349 return -EIO;
350 }
351 jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift;
352 350
353 if (gfs2_write_alloc_required(ip, 0, ip->i_disksize)) { 351 jd->jd_blocks = size >> sdp->sd_sb.sb_bsize_shift;
352
353 if (gfs2_write_alloc_required(ip, 0, size)) {
354 gfs2_consist_inode(ip); 354 gfs2_consist_inode(ip);
355 return -EIO; 355 return -EIO;
356 } 356 }
@@ -1129,9 +1129,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
1129 1129
1130 /* Some flags must not be changed */ 1130 /* Some flags must not be changed */
1131 if (args_neq(&args, &sdp->sd_args, spectator) || 1131 if (args_neq(&args, &sdp->sd_args, spectator) ||
1132 args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
1133 args_neq(&args, &sdp->sd_args, localflocks) || 1132 args_neq(&args, &sdp->sd_args, localflocks) ||
1134 args_neq(&args, &sdp->sd_args, localcaching) ||
1135 args_neq(&args, &sdp->sd_args, meta)) 1133 args_neq(&args, &sdp->sd_args, meta))
1136 return -EINVAL; 1134 return -EINVAL;
1137 1135
@@ -1234,16 +1232,10 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
1234 seq_printf(s, ",hostdata=%s", args->ar_hostdata); 1232 seq_printf(s, ",hostdata=%s", args->ar_hostdata);
1235 if (args->ar_spectator) 1233 if (args->ar_spectator)
1236 seq_printf(s, ",spectator"); 1234 seq_printf(s, ",spectator");
1237 if (args->ar_ignore_local_fs)
1238 seq_printf(s, ",ignore_local_fs");
1239 if (args->ar_localflocks) 1235 if (args->ar_localflocks)
1240 seq_printf(s, ",localflocks"); 1236 seq_printf(s, ",localflocks");
1241 if (args->ar_localcaching)
1242 seq_printf(s, ",localcaching");
1243 if (args->ar_debug) 1237 if (args->ar_debug)
1244 seq_printf(s, ",debug"); 1238 seq_printf(s, ",debug");
1245 if (args->ar_upgrade)
1246 seq_printf(s, ",upgrade");
1247 if (args->ar_posix_acl) 1239 if (args->ar_posix_acl)
1248 seq_printf(s, ",acl"); 1240 seq_printf(s, ",acl");
1249 if (args->ar_quota != GFS2_QUOTA_DEFAULT) { 1241 if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index ccacffd2faaa..748ccb557c18 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -230,7 +230,10 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
230 230
231 if (gltype > LM_TYPE_JOURNAL) 231 if (gltype > LM_TYPE_JOURNAL)
232 return -EINVAL; 232 return -EINVAL;
233 glops = gfs2_glops_list[gltype]; 233 if (gltype == LM_TYPE_NONDISK && glnum == GFS2_TRANS_LOCK)
234 glops = &gfs2_trans_glops;
235 else
236 glops = gfs2_glops_list[gltype];
234 if (glops == NULL) 237 if (glops == NULL)
235 return -EINVAL; 238 return -EINVAL;
236 if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags)) 239 if (!test_and_set_bit(SDF_DEMOTE, &sdp->sd_flags))
@@ -399,31 +402,32 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf)
399 402
400static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf) 403static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
401{ 404{
402 return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid); 405 return sprintf(buf, "%d\n", sdp->sd_lockstruct.ls_jid);
403} 406}
404 407
405static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len) 408static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
406{ 409{
407 unsigned jid; 410 int jid;
408 int rv; 411 int rv;
409 412
410 rv = sscanf(buf, "%u", &jid); 413 rv = sscanf(buf, "%d", &jid);
411 if (rv != 1) 414 if (rv != 1)
412 return -EINVAL; 415 return -EINVAL;
413 416
414 spin_lock(&sdp->sd_jindex_spin); 417 spin_lock(&sdp->sd_jindex_spin);
415 rv = -EINVAL; 418 rv = -EINVAL;
416 if (sdp->sd_args.ar_spectator)
417 goto out;
418 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL) 419 if (sdp->sd_lockstruct.ls_ops->lm_mount == NULL)
419 goto out; 420 goto out;
420 rv = -EBUSY; 421 rv = -EBUSY;
421 if (test_and_clear_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0) 422 if (test_bit(SDF_NOJOURNALID, &sdp->sd_flags) == 0)
422 goto out; 423 goto out;
424 rv = 0;
425 if (sdp->sd_args.ar_spectator && jid > 0)
426 rv = jid = -EINVAL;
423 sdp->sd_lockstruct.ls_jid = jid; 427 sdp->sd_lockstruct.ls_jid = jid;
428 clear_bit(SDF_NOJOURNALID, &sdp->sd_flags);
424 smp_mb__after_clear_bit(); 429 smp_mb__after_clear_bit();
425 wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID); 430 wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
426 rv = 0;
427out: 431out:
428 spin_unlock(&sdp->sd_jindex_spin); 432 spin_unlock(&sdp->sd_jindex_spin);
429 return rv ? rv : len; 433 return rv ? rv : len;
@@ -617,7 +621,7 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj,
617 add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); 621 add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name);
618 add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); 622 add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name);
619 if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags)) 623 if (!test_bit(SDF_NOJOURNALID, &sdp->sd_flags))
620 add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid); 624 add_uevent_var(env, "JOURNALID=%d", sdp->sd_lockstruct.ls_jid);
621 if (gfs2_uuid_valid(uuid)) 625 if (gfs2_uuid_valid(uuid))
622 add_uevent_var(env, "UUID=%pUB", uuid); 626 add_uevent_var(env, "UUID=%pUB", uuid);
623 return 0; 627 return 0;
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 148d55c14171..cedb0bb96d96 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -39,7 +39,8 @@
39 {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \ 39 {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \
40 {(1UL << GLF_REPLY_PENDING), "r" }, \ 40 {(1UL << GLF_REPLY_PENDING), "r" }, \
41 {(1UL << GLF_INITIAL), "I" }, \ 41 {(1UL << GLF_INITIAL), "I" }, \
42 {(1UL << GLF_FROZEN), "F" }) 42 {(1UL << GLF_FROZEN), "F" }, \
43 {(1UL << GLF_QUEUED), "q" })
43 44
44#ifndef NUMPTY 45#ifndef NUMPTY
45#define NUMPTY 46#define NUMPTY
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index edf9d4bd908e..fb56b783e028 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -20,11 +20,20 @@ struct gfs2_glock;
20#define RES_JDATA 1 20#define RES_JDATA 1
21#define RES_DATA 1 21#define RES_DATA 1
22#define RES_LEAF 1 22#define RES_LEAF 1
23#define RES_RG_HDR 1
23#define RES_RG_BIT 2 24#define RES_RG_BIT 2
24#define RES_EATTR 1 25#define RES_EATTR 1
25#define RES_STATFS 1 26#define RES_STATFS 1
26#define RES_QUOTA 2 27#define RES_QUOTA 2
27 28
29/* reserve either the number of blocks to be allocated plus the rg header
30 * block, or all of the blocks in the rg, whichever is smaller */
31static inline unsigned int gfs2_rg_blocks(const struct gfs2_alloc *al)
32{
33 return (al->al_requested < al->al_rgd->rd_length)?
34 al->al_requested + 1 : al->al_rgd->rd_length;
35}
36
28int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, 37int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
29 unsigned int revokes); 38 unsigned int revokes);
30 39
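
The gfs2_rg_blocks() helper added in the trans.h hunk reserves either the requested blocks plus one resource-group header block, or the whole resource group, whichever is smaller. A small runnable sketch with the same arithmetic and assumed sample values (5, 200, 100 are illustrative, not from the patch):

#include <stdio.h>

/* Same arithmetic as gfs2_rg_blocks() above, with plain integers
 * instead of the gfs2_alloc structure. */
static unsigned int rg_blocks(unsigned int requested, unsigned int rd_length)
{
        return (requested < rd_length) ? requested + 1 : rd_length;
}

int main(void)
{
        /* small allocation: blocks to allocate plus the rg header block */
        printf("%u\n", rg_blocks(5, 100));      /* -> 6 */
        /* oversized request: capped at every block in the resource group */
        printf("%u\n", rg_blocks(200, 100));    /* -> 100 */
        return 0;
}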
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 776af6eb4bcb..30b58f07c8a6 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -734,7 +734,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
734 goto out_gunlock_q; 734 goto out_gunlock_q;
735 735
736 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), 736 error = gfs2_trans_begin(GFS2_SB(&ip->i_inode),
737 blks + al->al_rgd->rd_length + 737 blks + gfs2_rg_blocks(al) +
738 RES_DINODE + RES_STATFS + RES_QUOTA, 0); 738 RES_DINODE + RES_STATFS + RES_QUOTA, 0);
739 if (error) 739 if (error)
740 goto out_ipres; 740 goto out_ipres;
diff --git a/fs/hfs/bfind.c b/fs/hfs/bfind.c
index 4129cdb3f0d8..571abe97b42a 100644
--- a/fs/hfs/bfind.c
+++ b/fs/hfs/bfind.c
@@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
23 fd->search_key = ptr; 23 fd->search_key = ptr;
24 fd->key = ptr + tree->max_key_len + 2; 24 fd->key = ptr + tree->max_key_len + 2;
25 dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); 25 dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
26 down(&tree->tree_lock); 26 mutex_lock(&tree->tree_lock);
27 return 0; 27 return 0;
28} 28}
29 29
@@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd)
32 hfs_bnode_put(fd->bnode); 32 hfs_bnode_put(fd->bnode);
33 kfree(fd->search_key); 33 kfree(fd->search_key);
34 dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); 34 dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
35 up(&fd->tree->tree_lock); 35 mutex_unlock(&fd->tree->tree_lock);
36 fd->tree = NULL; 36 fd->tree = NULL;
37} 37}
38 38
diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c
index 38a0a9917d7f..3ebc437736fe 100644
--- a/fs/hfs/btree.c
+++ b/fs/hfs/btree.c
@@ -27,7 +27,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id, btree_keycmp ke
27 if (!tree) 27 if (!tree)
28 return NULL; 28 return NULL;
29 29
30 init_MUTEX(&tree->tree_lock); 30 mutex_init(&tree->tree_lock);
31 spin_lock_init(&tree->hash_lock); 31 spin_lock_init(&tree->hash_lock);
32 /* Set the correct compare function */ 32 /* Set the correct compare function */
33 tree->sb = sb; 33 tree->sb = sb;
diff --git a/fs/hfs/btree.h b/fs/hfs/btree.h
index cc51905ac21d..2a1d712f85dc 100644
--- a/fs/hfs/btree.h
+++ b/fs/hfs/btree.h
@@ -33,7 +33,7 @@ struct hfs_btree {
33 unsigned int depth; 33 unsigned int depth;
34 34
35 //unsigned int map1_size, map_size; 35 //unsigned int map1_size, map_size;
36 struct semaphore tree_lock; 36 struct mutex tree_lock;
37 37
38 unsigned int pages_per_bnode; 38 unsigned int pages_per_bnode;
39 spinlock_t hash_lock; 39 spinlock_t hash_lock;
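
The hfs hunks above convert tree_lock from a semaphore used as a mutex (init_MUTEX/down/up) to a real mutex (mutex_init/mutex_lock/mutex_unlock). A user-space analogue of the same conversion using pthreads is sketched below; struct btree and the find_init/find_exit names are hypothetical, standing in for the kernel structures.

#include <pthread.h>
#include <stdio.h>

/* Hypothetical user-space analogue of the hfs_btree tree_lock:
 * a plain mutex instead of a counting semaphore initialised to 1. */
struct btree {
        pthread_mutex_t tree_lock;
};

static void find_init(struct btree *tree)
{
        pthread_mutex_lock(&tree->tree_lock);   /* was down(&tree->tree_lock) */
}

static void find_exit(struct btree *tree)
{
        pthread_mutex_unlock(&tree->tree_lock); /* was up(&tree->tree_lock) */
}

int main(void)
{
        struct btree tree;

        pthread_mutex_init(&tree.tree_lock, NULL);      /* was init_MUTEX() */
        find_init(&tree);
        find_exit(&tree);
        pthread_mutex_destroy(&tree.tree_lock);
        return 0;
}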
diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c
index 5007a41f1be9..d182438c7ae4 100644
--- a/fs/hfsplus/bfind.c
+++ b/fs/hfsplus/bfind.c
@@ -23,7 +23,7 @@ int hfs_find_init(struct hfs_btree *tree, struct hfs_find_data *fd)
23 fd->search_key = ptr; 23 fd->search_key = ptr;
24 fd->key = ptr + tree->max_key_len + 2; 24 fd->key = ptr + tree->max_key_len + 2;
25 dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0)); 25 dprint(DBG_BNODE_REFS, "find_init: %d (%p)\n", tree->cnid, __builtin_return_address(0));
26 down(&tree->tree_lock); 26 mutex_lock(&tree->tree_lock);
27 return 0; 27 return 0;
28} 28}
29 29
@@ -32,7 +32,7 @@ void hfs_find_exit(struct hfs_find_data *fd)
32 hfs_bnode_put(fd->bnode); 32 hfs_bnode_put(fd->bnode);
33 kfree(fd->search_key); 33 kfree(fd->search_key);
34 dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0)); 34 dprint(DBG_BNODE_REFS, "find_exit: %d (%p)\n", fd->tree->cnid, __builtin_return_address(0));
35 up(&fd->tree->tree_lock); 35 mutex_unlock(&fd->tree->tree_lock);
36 fd->tree = NULL; 36 fd->tree = NULL;
37} 37}
38 38
@@ -52,6 +52,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
52 rec = (e + b) / 2; 52 rec = (e + b) / 2;
53 len = hfs_brec_lenoff(bnode, rec, &off); 53 len = hfs_brec_lenoff(bnode, rec, &off);
54 keylen = hfs_brec_keylen(bnode, rec); 54 keylen = hfs_brec_keylen(bnode, rec);
55 if (keylen == 0) {
56 res = -EINVAL;
57 goto fail;
58 }
55 hfs_bnode_read(bnode, fd->key, off, keylen); 59 hfs_bnode_read(bnode, fd->key, off, keylen);
56 cmpval = bnode->tree->keycmp(fd->key, fd->search_key); 60 cmpval = bnode->tree->keycmp(fd->key, fd->search_key);
57 if (!cmpval) { 61 if (!cmpval) {
@@ -67,6 +71,10 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd)
67 if (rec != e && e >= 0) { 71 if (rec != e && e >= 0) {
68 len = hfs_brec_lenoff(bnode, e, &off); 72 len = hfs_brec_lenoff(bnode, e, &off);
69 keylen = hfs_brec_keylen(bnode, e); 73 keylen = hfs_brec_keylen(bnode, e);
74 if (keylen == 0) {
75 res = -EINVAL;
76 goto fail;
77 }
70 hfs_bnode_read(bnode, fd->key, off, keylen); 78 hfs_bnode_read(bnode, fd->key, off, keylen);
71 } 79 }
72done: 80done:
@@ -75,6 +83,7 @@ done:
75 fd->keylength = keylen; 83 fd->keylength = keylen;
76 fd->entryoffset = off + keylen; 84 fd->entryoffset = off + keylen;
77 fd->entrylength = len - keylen; 85 fd->entrylength = len - keylen;
86fail:
78 return res; 87 return res;
79} 88}
80 89
@@ -198,6 +207,10 @@ int hfs_brec_goto(struct hfs_find_data *fd, int cnt)
198 207
199 len = hfs_brec_lenoff(bnode, fd->record, &off); 208 len = hfs_brec_lenoff(bnode, fd->record, &off);
200 keylen = hfs_brec_keylen(bnode, fd->record); 209 keylen = hfs_brec_keylen(bnode, fd->record);
210 if (keylen == 0) {
211 res = -EINVAL;
212 goto out;
213 }
201 fd->keyoffset = off; 214 fd->keyoffset = off;
202 fd->keylength = keylen; 215 fd->keylength = keylen;
203 fd->entryoffset = off + keylen; 216 fd->entryoffset = off + keylen;
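
The bfind.c hunks insert a keylen == 0 check before each hfs_bnode_read() of a key, turning a corrupt on-disk record into -EINVAL instead of a bogus read. The sketch below shows the same validate-before-copy idea in plain C; read_key and its arguments are hypothetical, not the HFS+ interfaces.

#include <errno.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical analogue of the keylen check added above: refuse to
 * copy a record whose stored key length came back as zero (or too big). */
static int read_key(const char *rec, unsigned int keylen, char *out, size_t outlen)
{
        if (keylen == 0 || keylen >= outlen)
                return -EINVAL;         /* corrupt node: fail, don't read garbage */
        memcpy(out, rec, keylen);
        out[keylen] = '\0';
        return 0;
}

int main(void)
{
        char buf[16];

        printf("%d\n", read_key("abc", 3, buf, sizeof(buf)));   /* 0 */
        printf("%d\n", read_key("abc", 0, buf, sizeof(buf)));   /* -EINVAL */
        return 0;
}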
diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c
index ea30afc2a03c..ad57f5991eb1 100644
--- a/fs/hfsplus/bitmap.c
+++ b/fs/hfsplus/bitmap.c
@@ -17,6 +17,7 @@
17 17
18int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *max) 18int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *max)
19{ 19{
20 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
20 struct page *page; 21 struct page *page;
21 struct address_space *mapping; 22 struct address_space *mapping;
22 __be32 *pptr, *curr, *end; 23 __be32 *pptr, *curr, *end;
@@ -29,8 +30,8 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma
29 return size; 30 return size;
30 31
31 dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len); 32 dprint(DBG_BITMAP, "block_allocate: %u,%u,%u\n", size, offset, len);
32 mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex); 33 mutex_lock(&sbi->alloc_mutex);
33 mapping = HFSPLUS_SB(sb).alloc_file->i_mapping; 34 mapping = sbi->alloc_file->i_mapping;
34 page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); 35 page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL);
35 if (IS_ERR(page)) { 36 if (IS_ERR(page)) {
36 start = size; 37 start = size;
@@ -150,16 +151,17 @@ done:
150 set_page_dirty(page); 151 set_page_dirty(page);
151 kunmap(page); 152 kunmap(page);
152 *max = offset + (curr - pptr) * 32 + i - start; 153 *max = offset + (curr - pptr) * 32 + i - start;
153 HFSPLUS_SB(sb).free_blocks -= *max; 154 sbi->free_blocks -= *max;
154 sb->s_dirt = 1; 155 sb->s_dirt = 1;
155 dprint(DBG_BITMAP, "-> %u,%u\n", start, *max); 156 dprint(DBG_BITMAP, "-> %u,%u\n", start, *max);
156out: 157out:
157 mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex); 158 mutex_unlock(&sbi->alloc_mutex);
158 return start; 159 return start;
159} 160}
160 161
161int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count) 162int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
162{ 163{
164 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
163 struct page *page; 165 struct page *page;
164 struct address_space *mapping; 166 struct address_space *mapping;
165 __be32 *pptr, *curr, *end; 167 __be32 *pptr, *curr, *end;
@@ -172,11 +174,11 @@ int hfsplus_block_free(struct super_block *sb, u32 offset, u32 count)
172 174
173 dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count); 175 dprint(DBG_BITMAP, "block_free: %u,%u\n", offset, count);
174 /* are all of the bits in range? */ 176 /* are all of the bits in range? */
175 if ((offset + count) > HFSPLUS_SB(sb).total_blocks) 177 if ((offset + count) > sbi->total_blocks)
176 return -2; 178 return -2;
177 179
178 mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex); 180 mutex_lock(&sbi->alloc_mutex);
179 mapping = HFSPLUS_SB(sb).alloc_file->i_mapping; 181 mapping = sbi->alloc_file->i_mapping;
180 pnr = offset / PAGE_CACHE_BITS; 182 pnr = offset / PAGE_CACHE_BITS;
181 page = read_mapping_page(mapping, pnr, NULL); 183 page = read_mapping_page(mapping, pnr, NULL);
182 pptr = kmap(page); 184 pptr = kmap(page);
@@ -224,9 +226,9 @@ done:
224out: 226out:
225 set_page_dirty(page); 227 set_page_dirty(page);
226 kunmap(page); 228 kunmap(page);
227 HFSPLUS_SB(sb).free_blocks += len; 229 sbi->free_blocks += len;
228 sb->s_dirt = 1; 230 sb->s_dirt = 1;
229 mutex_unlock(&HFSPLUS_SB(sb).alloc_file->i_mutex); 231 mutex_unlock(&sbi->alloc_mutex);
230 232
231 return 0; 233 return 0;
232} 234}
diff --git a/fs/hfsplus/brec.c b/fs/hfsplus/brec.c
index c88e5d72a402..2f39d05443e1 100644
--- a/fs/hfsplus/brec.c
+++ b/fs/hfsplus/brec.c
@@ -42,10 +42,13 @@ u16 hfs_brec_keylen(struct hfs_bnode *node, u16 rec)
42 recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2); 42 recoff = hfs_bnode_read_u16(node, node->tree->node_size - (rec + 1) * 2);
43 if (!recoff) 43 if (!recoff)
44 return 0; 44 return 0;
45 if (node->tree->attributes & HFS_TREE_BIGKEYS) 45
46 retval = hfs_bnode_read_u16(node, recoff) + 2; 46 retval = hfs_bnode_read_u16(node, recoff) + 2;
47 else 47 if (retval > node->tree->max_key_len + 2) {
48 retval = (hfs_bnode_read_u8(node, recoff) | 1) + 1; 48 printk(KERN_ERR "hfs: keylen %d too large\n",
49 retval);
50 retval = 0;
51 }
49 } 52 }
50 return retval; 53 return retval;
51} 54}
@@ -216,7 +219,7 @@ skip:
216static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd) 219static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
217{ 220{
218 struct hfs_btree *tree; 221 struct hfs_btree *tree;
219 struct hfs_bnode *node, *new_node; 222 struct hfs_bnode *node, *new_node, *next_node;
220 struct hfs_bnode_desc node_desc; 223 struct hfs_bnode_desc node_desc;
221 int num_recs, new_rec_off, new_off, old_rec_off; 224 int num_recs, new_rec_off, new_off, old_rec_off;
222 int data_start, data_end, size; 225 int data_start, data_end, size;
@@ -235,6 +238,17 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
235 new_node->type = node->type; 238 new_node->type = node->type;
236 new_node->height = node->height; 239 new_node->height = node->height;
237 240
241 if (node->next)
242 next_node = hfs_bnode_find(tree, node->next);
243 else
244 next_node = NULL;
245
246 if (IS_ERR(next_node)) {
247 hfs_bnode_put(node);
248 hfs_bnode_put(new_node);
249 return next_node;
250 }
251
238 size = tree->node_size / 2 - node->num_recs * 2 - 14; 252 size = tree->node_size / 2 - node->num_recs * 2 - 14;
239 old_rec_off = tree->node_size - 4; 253 old_rec_off = tree->node_size - 4;
240 num_recs = 1; 254 num_recs = 1;
@@ -248,6 +262,8 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
248 /* panic? */ 262 /* panic? */
249 hfs_bnode_put(node); 263 hfs_bnode_put(node);
250 hfs_bnode_put(new_node); 264 hfs_bnode_put(new_node);
265 if (next_node)
266 hfs_bnode_put(next_node);
251 return ERR_PTR(-ENOSPC); 267 return ERR_PTR(-ENOSPC);
252 } 268 }
253 269
@@ -302,8 +318,7 @@ static struct hfs_bnode *hfs_bnode_split(struct hfs_find_data *fd)
302 hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc)); 318 hfs_bnode_write(node, &node_desc, 0, sizeof(node_desc));
303 319
304 /* update next bnode header */ 320 /* update next bnode header */
305 if (new_node->next) { 321 if (next_node) {
306 struct hfs_bnode *next_node = hfs_bnode_find(tree, new_node->next);
307 next_node->prev = new_node->this; 322 next_node->prev = new_node->this;
308 hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc)); 323 hfs_bnode_read(next_node, &node_desc, 0, sizeof(node_desc));
309 node_desc.prev = cpu_to_be32(next_node->prev); 324 node_desc.prev = cpu_to_be32(next_node->prev);
diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c
index e49fcee1e293..22e4d4e32999 100644
--- a/fs/hfsplus/btree.c
+++ b/fs/hfsplus/btree.c
@@ -30,7 +30,7 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
30 if (!tree) 30 if (!tree)
31 return NULL; 31 return NULL;
32 32
33 init_MUTEX(&tree->tree_lock); 33 mutex_init(&tree->tree_lock);
34 spin_lock_init(&tree->hash_lock); 34 spin_lock_init(&tree->hash_lock);
35 tree->sb = sb; 35 tree->sb = sb;
36 tree->cnid = id; 36 tree->cnid = id;
@@ -39,10 +39,16 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
39 goto free_tree; 39 goto free_tree;
40 tree->inode = inode; 40 tree->inode = inode;
41 41
42 if (!HFSPLUS_I(tree->inode)->first_blocks) {
43 printk(KERN_ERR
44 "hfs: invalid btree extent records (0 size).\n");
45 goto free_inode;
46 }
47
42 mapping = tree->inode->i_mapping; 48 mapping = tree->inode->i_mapping;
43 page = read_mapping_page(mapping, 0, NULL); 49 page = read_mapping_page(mapping, 0, NULL);
44 if (IS_ERR(page)) 50 if (IS_ERR(page))
45 goto free_tree; 51 goto free_inode;
46 52
47 /* Load the header */ 53 /* Load the header */
48 head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc)); 54 head = (struct hfs_btree_header_rec *)(kmap(page) + sizeof(struct hfs_bnode_desc));
@@ -57,27 +63,56 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
57 tree->max_key_len = be16_to_cpu(head->max_key_len); 63 tree->max_key_len = be16_to_cpu(head->max_key_len);
58 tree->depth = be16_to_cpu(head->depth); 64 tree->depth = be16_to_cpu(head->depth);
59 65
60 /* Set the correct compare function */ 66 /* Verify the tree and set the correct compare function */
61 if (id == HFSPLUS_EXT_CNID) { 67 switch (id) {
68 case HFSPLUS_EXT_CNID:
69 if (tree->max_key_len != HFSPLUS_EXT_KEYLEN - sizeof(u16)) {
70 printk(KERN_ERR "hfs: invalid extent max_key_len %d\n",
71 tree->max_key_len);
72 goto fail_page;
73 }
74 if (tree->attributes & HFS_TREE_VARIDXKEYS) {
75 printk(KERN_ERR "hfs: invalid extent btree flag\n");
76 goto fail_page;
77 }
78
62 tree->keycmp = hfsplus_ext_cmp_key; 79 tree->keycmp = hfsplus_ext_cmp_key;
63 } else if (id == HFSPLUS_CAT_CNID) { 80 break;
64 if ((HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX) && 81 case HFSPLUS_CAT_CNID:
82 if (tree->max_key_len != HFSPLUS_CAT_KEYLEN - sizeof(u16)) {
83 printk(KERN_ERR "hfs: invalid catalog max_key_len %d\n",
84 tree->max_key_len);
85 goto fail_page;
86 }
87 if (!(tree->attributes & HFS_TREE_VARIDXKEYS)) {
88 printk(KERN_ERR "hfs: invalid catalog btree flag\n");
89 goto fail_page;
90 }
91
92 if (test_bit(HFSPLUS_SB_HFSX, &HFSPLUS_SB(sb)->flags) &&
65 (head->key_type == HFSPLUS_KEY_BINARY)) 93 (head->key_type == HFSPLUS_KEY_BINARY))
66 tree->keycmp = hfsplus_cat_bin_cmp_key; 94 tree->keycmp = hfsplus_cat_bin_cmp_key;
67 else { 95 else {
68 tree->keycmp = hfsplus_cat_case_cmp_key; 96 tree->keycmp = hfsplus_cat_case_cmp_key;
69 HFSPLUS_SB(sb).flags |= HFSPLUS_SB_CASEFOLD; 97 set_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
70 } 98 }
71 } else { 99 break;
100 default:
72 printk(KERN_ERR "hfs: unknown B*Tree requested\n"); 101 printk(KERN_ERR "hfs: unknown B*Tree requested\n");
73 goto fail_page; 102 goto fail_page;
74 } 103 }
75 104
105 if (!(tree->attributes & HFS_TREE_BIGKEYS)) {
106 printk(KERN_ERR "hfs: invalid btree flag\n");
107 goto fail_page;
108 }
109
76 size = tree->node_size; 110 size = tree->node_size;
77 if (!is_power_of_2(size)) 111 if (!is_power_of_2(size))
78 goto fail_page; 112 goto fail_page;
79 if (!tree->node_count) 113 if (!tree->node_count)
80 goto fail_page; 114 goto fail_page;
115
81 tree->node_size_shift = ffs(size) - 1; 116 tree->node_size_shift = ffs(size) - 1;
82 117
83 tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 118 tree->pages_per_bnode = (tree->node_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
@@ -87,10 +122,11 @@ struct hfs_btree *hfs_btree_open(struct super_block *sb, u32 id)
87 return tree; 122 return tree;
88 123
89 fail_page: 124 fail_page:
90 tree->inode->i_mapping->a_ops = &hfsplus_aops;
91 page_cache_release(page); 125 page_cache_release(page);
92 free_tree: 126 free_inode:
127 tree->inode->i_mapping->a_ops = &hfsplus_aops;
93 iput(tree->inode); 128 iput(tree->inode);
129 free_tree:
94 kfree(tree); 130 kfree(tree);
95 return NULL; 131 return NULL;
96} 132}
@@ -192,17 +228,18 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree)
192 228
193 while (!tree->free_nodes) { 229 while (!tree->free_nodes) {
194 struct inode *inode = tree->inode; 230 struct inode *inode = tree->inode;
231 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
195 u32 count; 232 u32 count;
196 int res; 233 int res;
197 234
198 res = hfsplus_file_extend(inode); 235 res = hfsplus_file_extend(inode);
199 if (res) 236 if (res)
200 return ERR_PTR(res); 237 return ERR_PTR(res);
201 HFSPLUS_I(inode).phys_size = inode->i_size = 238 hip->phys_size = inode->i_size =
202 (loff_t)HFSPLUS_I(inode).alloc_blocks << 239 (loff_t)hip->alloc_blocks <<
203 HFSPLUS_SB(tree->sb).alloc_blksz_shift; 240 HFSPLUS_SB(tree->sb)->alloc_blksz_shift;
204 HFSPLUS_I(inode).fs_blocks = HFSPLUS_I(inode).alloc_blocks << 241 hip->fs_blocks =
205 HFSPLUS_SB(tree->sb).fs_shift; 242 hip->alloc_blocks << HFSPLUS_SB(tree->sb)->fs_shift;
206 inode_set_bytes(inode, inode->i_size); 243 inode_set_bytes(inode, inode->i_size);
207 count = inode->i_size >> tree->node_size_shift; 244 count = inode->i_size >> tree->node_size_shift;
208 tree->free_nodes = count - tree->node_count; 245 tree->free_nodes = count - tree->node_count;
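
A recurring change in these hfsplus hunks is that HFSPLUS_SB() and HFSPLUS_I() now yield pointers rather than embedded structs (HFSPLUS_SB(sb).field becomes HFSPLUS_SB(sb)->field), and callers cache the result in a local sbi/hip once per function. A minimal sketch of that accessor pattern with simplified, hypothetical types:

#include <stdio.h>

/* Hypothetical, much-simplified stand-ins for super_block / hfsplus_sb_info. */
struct sb_info { unsigned int free_blocks; };
struct super_block { void *s_fs_info; };

/* Accessor returning a pointer, as the hunks above now assume,
 * instead of a by-value struct that cannot be shared or locked. */
static inline struct sb_info *SB_INFO(struct super_block *sb)
{
        return sb->s_fs_info;
}

int main(void)
{
        struct sb_info info = { .free_blocks = 128 };
        struct super_block sb = { .s_fs_info = &info };
        struct sb_info *sbi = SB_INFO(&sb);     /* cache once per function */

        sbi->free_blocks -= 8;
        printf("%u\n", SB_INFO(&sb)->free_blocks);      /* 120 */
        return 0;
}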
diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c
index f6874acb2cf2..8af45fc5b051 100644
--- a/fs/hfsplus/catalog.c
+++ b/fs/hfsplus/catalog.c
@@ -67,7 +67,7 @@ static void hfsplus_cat_build_key_uni(hfsplus_btree_key *key, u32 parent,
67 key->key_len = cpu_to_be16(6 + ustrlen); 67 key->key_len = cpu_to_be16(6 + ustrlen);
68} 68}
69 69
70static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms) 70void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms)
71{ 71{
72 if (inode->i_flags & S_IMMUTABLE) 72 if (inode->i_flags & S_IMMUTABLE)
73 perms->rootflags |= HFSPLUS_FLG_IMMUTABLE; 73 perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
@@ -77,15 +77,24 @@ static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
77 perms->rootflags |= HFSPLUS_FLG_APPEND; 77 perms->rootflags |= HFSPLUS_FLG_APPEND;
78 else 78 else
79 perms->rootflags &= ~HFSPLUS_FLG_APPEND; 79 perms->rootflags &= ~HFSPLUS_FLG_APPEND;
80 HFSPLUS_I(inode).rootflags = perms->rootflags; 80
81 HFSPLUS_I(inode).userflags = perms->userflags; 81 perms->userflags = HFSPLUS_I(inode)->userflags;
82 perms->mode = cpu_to_be16(inode->i_mode); 82 perms->mode = cpu_to_be16(inode->i_mode);
83 perms->owner = cpu_to_be32(inode->i_uid); 83 perms->owner = cpu_to_be32(inode->i_uid);
84 perms->group = cpu_to_be32(inode->i_gid); 84 perms->group = cpu_to_be32(inode->i_gid);
85
86 if (S_ISREG(inode->i_mode))
87 perms->dev = cpu_to_be32(inode->i_nlink);
88 else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode))
89 perms->dev = cpu_to_be32(inode->i_rdev);
90 else
91 perms->dev = 0;
85} 92}
86 93
87static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct inode *inode) 94static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct inode *inode)
88{ 95{
96 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
97
89 if (S_ISDIR(inode->i_mode)) { 98 if (S_ISDIR(inode->i_mode)) {
90 struct hfsplus_cat_folder *folder; 99 struct hfsplus_cat_folder *folder;
91 100
@@ -93,13 +102,13 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
93 memset(folder, 0, sizeof(*folder)); 102 memset(folder, 0, sizeof(*folder));
94 folder->type = cpu_to_be16(HFSPLUS_FOLDER); 103 folder->type = cpu_to_be16(HFSPLUS_FOLDER);
95 folder->id = cpu_to_be32(inode->i_ino); 104 folder->id = cpu_to_be32(inode->i_ino);
96 HFSPLUS_I(inode).create_date = 105 HFSPLUS_I(inode)->create_date =
97 folder->create_date = 106 folder->create_date =
98 folder->content_mod_date = 107 folder->content_mod_date =
99 folder->attribute_mod_date = 108 folder->attribute_mod_date =
100 folder->access_date = hfsp_now2mt(); 109 folder->access_date = hfsp_now2mt();
101 hfsplus_set_perms(inode, &folder->permissions); 110 hfsplus_cat_set_perms(inode, &folder->permissions);
102 if (inode == HFSPLUS_SB(inode->i_sb).hidden_dir) 111 if (inode == sbi->hidden_dir)
103 /* invisible and namelocked */ 112 /* invisible and namelocked */
104 folder->user_info.frFlags = cpu_to_be16(0x5000); 113 folder->user_info.frFlags = cpu_to_be16(0x5000);
105 return sizeof(*folder); 114 return sizeof(*folder);
@@ -111,19 +120,19 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
111 file->type = cpu_to_be16(HFSPLUS_FILE); 120 file->type = cpu_to_be16(HFSPLUS_FILE);
112 file->flags = cpu_to_be16(HFSPLUS_FILE_THREAD_EXISTS); 121 file->flags = cpu_to_be16(HFSPLUS_FILE_THREAD_EXISTS);
113 file->id = cpu_to_be32(cnid); 122 file->id = cpu_to_be32(cnid);
114 HFSPLUS_I(inode).create_date = 123 HFSPLUS_I(inode)->create_date =
115 file->create_date = 124 file->create_date =
116 file->content_mod_date = 125 file->content_mod_date =
117 file->attribute_mod_date = 126 file->attribute_mod_date =
118 file->access_date = hfsp_now2mt(); 127 file->access_date = hfsp_now2mt();
119 if (cnid == inode->i_ino) { 128 if (cnid == inode->i_ino) {
120 hfsplus_set_perms(inode, &file->permissions); 129 hfsplus_cat_set_perms(inode, &file->permissions);
121 if (S_ISLNK(inode->i_mode)) { 130 if (S_ISLNK(inode->i_mode)) {
122 file->user_info.fdType = cpu_to_be32(HFSP_SYMLINK_TYPE); 131 file->user_info.fdType = cpu_to_be32(HFSP_SYMLINK_TYPE);
123 file->user_info.fdCreator = cpu_to_be32(HFSP_SYMLINK_CREATOR); 132 file->user_info.fdCreator = cpu_to_be32(HFSP_SYMLINK_CREATOR);
124 } else { 133 } else {
125 file->user_info.fdType = cpu_to_be32(HFSPLUS_SB(inode->i_sb).type); 134 file->user_info.fdType = cpu_to_be32(sbi->type);
126 file->user_info.fdCreator = cpu_to_be32(HFSPLUS_SB(inode->i_sb).creator); 135 file->user_info.fdCreator = cpu_to_be32(sbi->creator);
127 } 136 }
128 if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE) 137 if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
129 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); 138 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
@@ -131,8 +140,8 @@ static int hfsplus_cat_build_record(hfsplus_cat_entry *entry, u32 cnid, struct i
131 file->user_info.fdType = cpu_to_be32(HFSP_HARDLINK_TYPE); 140 file->user_info.fdType = cpu_to_be32(HFSP_HARDLINK_TYPE);
132 file->user_info.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR); 141 file->user_info.fdCreator = cpu_to_be32(HFSP_HFSPLUS_CREATOR);
133 file->user_info.fdFlags = cpu_to_be16(0x100); 142 file->user_info.fdFlags = cpu_to_be16(0x100);
134 file->create_date = HFSPLUS_I(HFSPLUS_SB(inode->i_sb).hidden_dir).create_date; 143 file->create_date = HFSPLUS_I(sbi->hidden_dir)->create_date;
135 file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode).dev); 144 file->permissions.dev = cpu_to_be32(HFSPLUS_I(inode)->linkid);
136 } 145 }
137 return sizeof(*file); 146 return sizeof(*file);
138 } 147 }
@@ -180,15 +189,14 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid,
180 189
181int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode) 190int hfsplus_create_cat(u32 cnid, struct inode *dir, struct qstr *str, struct inode *inode)
182{ 191{
192 struct super_block *sb = dir->i_sb;
183 struct hfs_find_data fd; 193 struct hfs_find_data fd;
184 struct super_block *sb;
185 hfsplus_cat_entry entry; 194 hfsplus_cat_entry entry;
186 int entry_size; 195 int entry_size;
187 int err; 196 int err;
188 197
189 dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink); 198 dprint(DBG_CAT_MOD, "create_cat: %s,%u(%d)\n", str->name, cnid, inode->i_nlink);
190 sb = dir->i_sb; 199 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
191 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
192 200
193 hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); 201 hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL);
194 entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ? 202 entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ?
@@ -234,7 +242,7 @@ err2:
234 242
235int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) 243int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
236{ 244{
237 struct super_block *sb; 245 struct super_block *sb = dir->i_sb;
238 struct hfs_find_data fd; 246 struct hfs_find_data fd;
239 struct hfsplus_fork_raw fork; 247 struct hfsplus_fork_raw fork;
240 struct list_head *pos; 248 struct list_head *pos;
@@ -242,8 +250,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
242 u16 type; 250 u16 type;
243 251
244 dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); 252 dprint(DBG_CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
245 sb = dir->i_sb; 253 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
246 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd);
247 254
248 if (!str) { 255 if (!str) {
249 int len; 256 int len;
@@ -279,7 +286,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str)
279 hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC); 286 hfsplus_free_fork(sb, cnid, &fork, HFSPLUS_TYPE_RSRC);
280 } 287 }
281 288
282 list_for_each(pos, &HFSPLUS_I(dir).open_dir_list) { 289 list_for_each(pos, &HFSPLUS_I(dir)->open_dir_list) {
283 struct hfsplus_readdir_data *rd = 290 struct hfsplus_readdir_data *rd =
284 list_entry(pos, struct hfsplus_readdir_data, list); 291 list_entry(pos, struct hfsplus_readdir_data, list);
285 if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0) 292 if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0)
@@ -312,7 +319,7 @@ int hfsplus_rename_cat(u32 cnid,
312 struct inode *src_dir, struct qstr *src_name, 319 struct inode *src_dir, struct qstr *src_name,
313 struct inode *dst_dir, struct qstr *dst_name) 320 struct inode *dst_dir, struct qstr *dst_name)
314{ 321{
315 struct super_block *sb; 322 struct super_block *sb = src_dir->i_sb;
316 struct hfs_find_data src_fd, dst_fd; 323 struct hfs_find_data src_fd, dst_fd;
317 hfsplus_cat_entry entry; 324 hfsplus_cat_entry entry;
318 int entry_size, type; 325 int entry_size, type;
@@ -320,8 +327,7 @@ int hfsplus_rename_cat(u32 cnid,
320 327
321 dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name, 328 dprint(DBG_CAT_MOD, "rename_cat: %u - %lu,%s - %lu,%s\n", cnid, src_dir->i_ino, src_name->name,
322 dst_dir->i_ino, dst_name->name); 329 dst_dir->i_ino, dst_name->name);
323 sb = src_dir->i_sb; 330 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &src_fd);
324 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &src_fd);
325 dst_fd = src_fd; 331 dst_fd = src_fd;
326 332
327 /* find the old dir entry and read the data */ 333 /* find the old dir entry and read the data */
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 764fd1bdca88..d236d85ec9d7 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -39,7 +39,7 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry,
39 39
40 dentry->d_op = &hfsplus_dentry_operations; 40 dentry->d_op = &hfsplus_dentry_operations;
41 dentry->d_fsdata = NULL; 41 dentry->d_fsdata = NULL;
42 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 42 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
43 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name); 43 hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name);
44again: 44again:
45 err = hfs_brec_read(&fd, &entry, sizeof(entry)); 45 err = hfs_brec_read(&fd, &entry, sizeof(entry));
@@ -68,9 +68,9 @@ again:
68 cnid = be32_to_cpu(entry.file.id); 68 cnid = be32_to_cpu(entry.file.id);
69 if (entry.file.user_info.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) && 69 if (entry.file.user_info.fdType == cpu_to_be32(HFSP_HARDLINK_TYPE) &&
70 entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) && 70 entry.file.user_info.fdCreator == cpu_to_be32(HFSP_HFSPLUS_CREATOR) &&
71 (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb).hidden_dir).create_date || 71 (entry.file.create_date == HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)->create_date ||
72 entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode).create_date) && 72 entry.file.create_date == HFSPLUS_I(sb->s_root->d_inode)->create_date) &&
73 HFSPLUS_SB(sb).hidden_dir) { 73 HFSPLUS_SB(sb)->hidden_dir) {
74 struct qstr str; 74 struct qstr str;
75 char name[32]; 75 char name[32];
76 76
@@ -86,7 +86,8 @@ again:
86 linkid = be32_to_cpu(entry.file.permissions.dev); 86 linkid = be32_to_cpu(entry.file.permissions.dev);
87 str.len = sprintf(name, "iNode%d", linkid); 87 str.len = sprintf(name, "iNode%d", linkid);
88 str.name = name; 88 str.name = name;
89 hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_SB(sb).hidden_dir->i_ino, &str); 89 hfsplus_cat_build_key(sb, fd.search_key,
90 HFSPLUS_SB(sb)->hidden_dir->i_ino, &str);
90 goto again; 91 goto again;
91 } 92 }
92 } else if (!dentry->d_fsdata) 93 } else if (!dentry->d_fsdata)
@@ -101,7 +102,7 @@ again:
101 if (IS_ERR(inode)) 102 if (IS_ERR(inode))
102 return ERR_CAST(inode); 103 return ERR_CAST(inode);
103 if (S_ISREG(inode->i_mode)) 104 if (S_ISREG(inode->i_mode))
104 HFSPLUS_I(inode).dev = linkid; 105 HFSPLUS_I(inode)->linkid = linkid;
105out: 106out:
106 d_add(dentry, inode); 107 d_add(dentry, inode);
107 return NULL; 108 return NULL;
@@ -124,7 +125,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
124 if (filp->f_pos >= inode->i_size) 125 if (filp->f_pos >= inode->i_size)
125 return 0; 126 return 0;
126 127
127 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 128 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
128 hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); 129 hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL);
129 err = hfs_brec_find(&fd); 130 err = hfs_brec_find(&fd);
130 if (err) 131 if (err)
@@ -180,8 +181,9 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
180 err = -EIO; 181 err = -EIO;
181 goto out; 182 goto out;
182 } 183 }
183 if (HFSPLUS_SB(sb).hidden_dir && 184 if (HFSPLUS_SB(sb)->hidden_dir &&
184 HFSPLUS_SB(sb).hidden_dir->i_ino == be32_to_cpu(entry.folder.id)) 185 HFSPLUS_SB(sb)->hidden_dir->i_ino ==
186 be32_to_cpu(entry.folder.id))
185 goto next; 187 goto next;
186 if (filldir(dirent, strbuf, len, filp->f_pos, 188 if (filldir(dirent, strbuf, len, filp->f_pos,
187 be32_to_cpu(entry.folder.id), DT_DIR)) 189 be32_to_cpu(entry.folder.id), DT_DIR))
@@ -217,7 +219,7 @@ static int hfsplus_readdir(struct file *filp, void *dirent, filldir_t filldir)
217 } 219 }
218 filp->private_data = rd; 220 filp->private_data = rd;
219 rd->file = filp; 221 rd->file = filp;
220 list_add(&rd->list, &HFSPLUS_I(inode).open_dir_list); 222 list_add(&rd->list, &HFSPLUS_I(inode)->open_dir_list);
221 } 223 }
222 memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key)); 224 memcpy(&rd->key, fd.key, sizeof(struct hfsplus_cat_key));
223out: 225out:
@@ -229,38 +231,18 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file)
229{ 231{
230 struct hfsplus_readdir_data *rd = file->private_data; 232 struct hfsplus_readdir_data *rd = file->private_data;
231 if (rd) { 233 if (rd) {
234 mutex_lock(&inode->i_mutex);
232 list_del(&rd->list); 235 list_del(&rd->list);
236 mutex_unlock(&inode->i_mutex);
233 kfree(rd); 237 kfree(rd);
234 } 238 }
235 return 0; 239 return 0;
236} 240}
237 241
238static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
239 struct nameidata *nd)
240{
241 struct inode *inode;
242 int res;
243
244 inode = hfsplus_new_inode(dir->i_sb, mode);
245 if (!inode)
246 return -ENOSPC;
247
248 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
249 if (res) {
250 inode->i_nlink = 0;
251 hfsplus_delete_inode(inode);
252 iput(inode);
253 return res;
254 }
255 hfsplus_instantiate(dentry, inode, inode->i_ino);
256 mark_inode_dirty(inode);
257 return 0;
258}
259
260static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, 242static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
261 struct dentry *dst_dentry) 243 struct dentry *dst_dentry)
262{ 244{
263 struct super_block *sb = dst_dir->i_sb; 245 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dst_dir->i_sb);
264 struct inode *inode = src_dentry->d_inode; 246 struct inode *inode = src_dentry->d_inode;
265 struct inode *src_dir = src_dentry->d_parent->d_inode; 247 struct inode *src_dir = src_dentry->d_parent->d_inode;
266 struct qstr str; 248 struct qstr str;
@@ -270,7 +252,10 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
270 252
271 if (HFSPLUS_IS_RSRC(inode)) 253 if (HFSPLUS_IS_RSRC(inode))
272 return -EPERM; 254 return -EPERM;
255 if (!S_ISREG(inode->i_mode))
256 return -EPERM;
273 257
258 mutex_lock(&sbi->vh_mutex);
274 if (inode->i_ino == (u32)(unsigned long)src_dentry->d_fsdata) { 259 if (inode->i_ino == (u32)(unsigned long)src_dentry->d_fsdata) {
275 for (;;) { 260 for (;;) {
276 get_random_bytes(&id, sizeof(cnid)); 261 get_random_bytes(&id, sizeof(cnid));
@@ -279,40 +264,41 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir,
279 str.len = sprintf(name, "iNode%d", id); 264 str.len = sprintf(name, "iNode%d", id);
280 res = hfsplus_rename_cat(inode->i_ino, 265 res = hfsplus_rename_cat(inode->i_ino,
281 src_dir, &src_dentry->d_name, 266 src_dir, &src_dentry->d_name,
282 HFSPLUS_SB(sb).hidden_dir, &str); 267 sbi->hidden_dir, &str);
283 if (!res) 268 if (!res)
284 break; 269 break;
285 if (res != -EEXIST) 270 if (res != -EEXIST)
286 return res; 271 goto out;
287 } 272 }
288 HFSPLUS_I(inode).dev = id; 273 HFSPLUS_I(inode)->linkid = id;
289 cnid = HFSPLUS_SB(sb).next_cnid++; 274 cnid = sbi->next_cnid++;
290 src_dentry->d_fsdata = (void *)(unsigned long)cnid; 275 src_dentry->d_fsdata = (void *)(unsigned long)cnid;
291 res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode); 276 res = hfsplus_create_cat(cnid, src_dir, &src_dentry->d_name, inode);
292 if (res) 277 if (res)
293 /* panic? */ 278 /* panic? */
294 return res; 279 goto out;
295 HFSPLUS_SB(sb).file_count++; 280 sbi->file_count++;
296 } 281 }
297 cnid = HFSPLUS_SB(sb).next_cnid++; 282 cnid = sbi->next_cnid++;
298 res = hfsplus_create_cat(cnid, dst_dir, &dst_dentry->d_name, inode); 283 res = hfsplus_create_cat(cnid, dst_dir, &dst_dentry->d_name, inode);
299 if (res) 284 if (res)
300 return res; 285 goto out;
301 286
302 inc_nlink(inode); 287 inc_nlink(inode);
303 hfsplus_instantiate(dst_dentry, inode, cnid); 288 hfsplus_instantiate(dst_dentry, inode, cnid);
304 atomic_inc(&inode->i_count); 289 atomic_inc(&inode->i_count);
305 inode->i_ctime = CURRENT_TIME_SEC; 290 inode->i_ctime = CURRENT_TIME_SEC;
306 mark_inode_dirty(inode); 291 mark_inode_dirty(inode);
307 HFSPLUS_SB(sb).file_count++; 292 sbi->file_count++;
308 sb->s_dirt = 1; 293 dst_dir->i_sb->s_dirt = 1;
309 294out:
310 return 0; 295 mutex_unlock(&sbi->vh_mutex);
296 return res;
311} 297}
312 298
313static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) 299static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
314{ 300{
315 struct super_block *sb = dir->i_sb; 301 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
316 struct inode *inode = dentry->d_inode; 302 struct inode *inode = dentry->d_inode;
317 struct qstr str; 303 struct qstr str;
318 char name[32]; 304 char name[32];
@@ -322,21 +308,22 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
322 if (HFSPLUS_IS_RSRC(inode)) 308 if (HFSPLUS_IS_RSRC(inode))
323 return -EPERM; 309 return -EPERM;
324 310
311 mutex_lock(&sbi->vh_mutex);
325 cnid = (u32)(unsigned long)dentry->d_fsdata; 312 cnid = (u32)(unsigned long)dentry->d_fsdata;
326 if (inode->i_ino == cnid && 313 if (inode->i_ino == cnid &&
327 atomic_read(&HFSPLUS_I(inode).opencnt)) { 314 atomic_read(&HFSPLUS_I(inode)->opencnt)) {
328 str.name = name; 315 str.name = name;
329 str.len = sprintf(name, "temp%lu", inode->i_ino); 316 str.len = sprintf(name, "temp%lu", inode->i_ino);
330 res = hfsplus_rename_cat(inode->i_ino, 317 res = hfsplus_rename_cat(inode->i_ino,
331 dir, &dentry->d_name, 318 dir, &dentry->d_name,
332 HFSPLUS_SB(sb).hidden_dir, &str); 319 sbi->hidden_dir, &str);
333 if (!res) 320 if (!res)
334 inode->i_flags |= S_DEAD; 321 inode->i_flags |= S_DEAD;
335 return res; 322 goto out;
336 } 323 }
337 res = hfsplus_delete_cat(cnid, dir, &dentry->d_name); 324 res = hfsplus_delete_cat(cnid, dir, &dentry->d_name);
338 if (res) 325 if (res)
339 return res; 326 goto out;
340 327
341 if (inode->i_nlink > 0) 328 if (inode->i_nlink > 0)
342 drop_nlink(inode); 329 drop_nlink(inode);
@@ -344,10 +331,10 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
344 clear_nlink(inode); 331 clear_nlink(inode);
345 if (!inode->i_nlink) { 332 if (!inode->i_nlink) {
346 if (inode->i_ino != cnid) { 333 if (inode->i_ino != cnid) {
347 HFSPLUS_SB(sb).file_count--; 334 sbi->file_count--;
348 if (!atomic_read(&HFSPLUS_I(inode).opencnt)) { 335 if (!atomic_read(&HFSPLUS_I(inode)->opencnt)) {
349 res = hfsplus_delete_cat(inode->i_ino, 336 res = hfsplus_delete_cat(inode->i_ino,
350 HFSPLUS_SB(sb).hidden_dir, 337 sbi->hidden_dir,
351 NULL); 338 NULL);
352 if (!res) 339 if (!res)
353 hfsplus_delete_inode(inode); 340 hfsplus_delete_inode(inode);
@@ -356,107 +343,108 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry)
356 } else 343 } else
357 hfsplus_delete_inode(inode); 344 hfsplus_delete_inode(inode);
358 } else 345 } else
359 HFSPLUS_SB(sb).file_count--; 346 sbi->file_count--;
360 inode->i_ctime = CURRENT_TIME_SEC; 347 inode->i_ctime = CURRENT_TIME_SEC;
361 mark_inode_dirty(inode); 348 mark_inode_dirty(inode);
362 349out:
350 mutex_unlock(&sbi->vh_mutex);
363 return res; 351 return res;
364} 352}
365 353
366static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode)
367{
368 struct inode *inode;
369 int res;
370
371 inode = hfsplus_new_inode(dir->i_sb, S_IFDIR | mode);
372 if (!inode)
373 return -ENOSPC;
374
375 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
376 if (res) {
377 inode->i_nlink = 0;
378 hfsplus_delete_inode(inode);
379 iput(inode);
380 return res;
381 }
382 hfsplus_instantiate(dentry, inode, inode->i_ino);
383 mark_inode_dirty(inode);
384 return 0;
385}
386
387static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry) 354static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry)
388{ 355{
389 struct inode *inode; 356 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
357 struct inode *inode = dentry->d_inode;
390 int res; 358 int res;
391 359
392 inode = dentry->d_inode;
393 if (inode->i_size != 2) 360 if (inode->i_size != 2)
394 return -ENOTEMPTY; 361 return -ENOTEMPTY;
362
363 mutex_lock(&sbi->vh_mutex);
395 res = hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name); 364 res = hfsplus_delete_cat(inode->i_ino, dir, &dentry->d_name);
396 if (res) 365 if (res)
397 return res; 366 goto out;
398 clear_nlink(inode); 367 clear_nlink(inode);
399 inode->i_ctime = CURRENT_TIME_SEC; 368 inode->i_ctime = CURRENT_TIME_SEC;
400 hfsplus_delete_inode(inode); 369 hfsplus_delete_inode(inode);
401 mark_inode_dirty(inode); 370 mark_inode_dirty(inode);
402 return 0; 371out:
372 mutex_unlock(&sbi->vh_mutex);
373 return res;
403} 374}
404 375
405static int hfsplus_symlink(struct inode *dir, struct dentry *dentry, 376static int hfsplus_symlink(struct inode *dir, struct dentry *dentry,
406 const char *symname) 377 const char *symname)
407{ 378{
408 struct super_block *sb; 379 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
409 struct inode *inode; 380 struct inode *inode;
410 int res; 381 int res = -ENOSPC;
411 382
412 sb = dir->i_sb; 383 mutex_lock(&sbi->vh_mutex);
413 inode = hfsplus_new_inode(sb, S_IFLNK | S_IRWXUGO); 384 inode = hfsplus_new_inode(dir->i_sb, S_IFLNK | S_IRWXUGO);
414 if (!inode) 385 if (!inode)
415 return -ENOSPC; 386 goto out;
416 387
417 res = page_symlink(inode, symname, strlen(symname) + 1); 388 res = page_symlink(inode, symname, strlen(symname) + 1);
418 if (res) { 389 if (res)
419 inode->i_nlink = 0; 390 goto out_err;
420 hfsplus_delete_inode(inode);
421 iput(inode);
422 return res;
423 }
424 391
425 mark_inode_dirty(inode);
426 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); 392 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
393 if (res)
394 goto out_err;
427 395
428 if (!res) { 396 hfsplus_instantiate(dentry, inode, inode->i_ino);
429 hfsplus_instantiate(dentry, inode, inode->i_ino); 397 mark_inode_dirty(inode);
430 mark_inode_dirty(inode); 398 goto out;
431 }
432 399
400out_err:
401 inode->i_nlink = 0;
402 hfsplus_delete_inode(inode);
403 iput(inode);
404out:
405 mutex_unlock(&sbi->vh_mutex);
433 return res; 406 return res;
434} 407}
435 408
436static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, 409static int hfsplus_mknod(struct inode *dir, struct dentry *dentry,
437 int mode, dev_t rdev) 410 int mode, dev_t rdev)
438{ 411{
439 struct super_block *sb; 412 struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb);
440 struct inode *inode; 413 struct inode *inode;
441 int res; 414 int res = -ENOSPC;
442 415
443 sb = dir->i_sb; 416 mutex_lock(&sbi->vh_mutex);
444 inode = hfsplus_new_inode(sb, mode); 417 inode = hfsplus_new_inode(dir->i_sb, mode);
445 if (!inode) 418 if (!inode)
446 return -ENOSPC; 419 goto out;
420
421 if (S_ISBLK(mode) || S_ISCHR(mode) || S_ISFIFO(mode) || S_ISSOCK(mode))
422 init_special_inode(inode, mode, rdev);
447 423
448 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode); 424 res = hfsplus_create_cat(inode->i_ino, dir, &dentry->d_name, inode);
449 if (res) { 425 if (res) {
450 inode->i_nlink = 0; 426 inode->i_nlink = 0;
451 hfsplus_delete_inode(inode); 427 hfsplus_delete_inode(inode);
452 iput(inode); 428 iput(inode);
453 return res; 429 goto out;
454 } 430 }
455 init_special_inode(inode, mode, rdev); 431
456 hfsplus_instantiate(dentry, inode, inode->i_ino); 432 hfsplus_instantiate(dentry, inode, inode->i_ino);
457 mark_inode_dirty(inode); 433 mark_inode_dirty(inode);
434out:
435 mutex_unlock(&sbi->vh_mutex);
436 return res;
437}
458 438
459 return 0; 439static int hfsplus_create(struct inode *dir, struct dentry *dentry, int mode,
440 struct nameidata *nd)
441{
442 return hfsplus_mknod(dir, dentry, mode, 0);
443}
444
445static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, int mode)
446{
447 return hfsplus_mknod(dir, dentry, mode | S_IFDIR, 0);
460} 448}
461 449
462static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, 450static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
@@ -466,7 +454,10 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry,
466 454
467 /* Unlink destination if it already exists */ 455 /* Unlink destination if it already exists */
468 if (new_dentry->d_inode) { 456 if (new_dentry->d_inode) {
469 res = hfsplus_unlink(new_dir, new_dentry); 457 if (S_ISDIR(new_dentry->d_inode->i_mode))
458 res = hfsplus_rmdir(new_dir, new_dentry);
459 else
460 res = hfsplus_unlink(new_dir, new_dentry);
470 if (res) 461 if (res)
471 return res; 462 return res;
472 } 463 }
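
The hunks above give hfsplus_rmdir(), hfsplus_symlink() and hfsplus_mknod() one common shape: take sbi->vh_mutex before touching the catalog or the volume-header counters, and unwind through goto labels so the mutex is dropped on every exit path, including the error paths that have to delete a half-created inode. A rough, self-contained illustration of that locking shape (userspace C, with a pthread mutex standing in for the kernel's struct mutex; every name below is illustrative, not the kernel API):

#include <errno.h>
#include <pthread.h>

struct vol_info {
	pthread_mutex_t vh_mutex;       /* stands in for sbi->vh_mutex */
	unsigned int file_count;        /* mutable volume-header state */
};

/* trivial stubs standing in for hfsplus_new_inode()/hfsplus_create_cat() */
static int alloc_object(struct vol_info *vi) { (void)vi; return 1; }
static int insert_record(struct vol_info *vi, int obj) { (void)vi; (void)obj; return 0; }
static void free_object(struct vol_info *vi, int obj) { (void)vi; (void)obj; }

static int create_entry(struct vol_info *vi)
{
	int res = -ENOSPC;
	int obj;

	pthread_mutex_lock(&vi->vh_mutex);
	obj = alloc_object(vi);
	if (obj < 0)
		goto out;               /* nothing to undo yet */

	res = insert_record(vi, obj);
	if (res)
		goto out_err;           /* undo the allocation, then drop the lock */

	vi->file_count++;               /* serialized by vh_mutex */
	res = 0;
	goto out;

out_err:
	free_object(vi, obj);
out:
	pthread_mutex_unlock(&vi->vh_mutex);
	return res;
}
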
diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c
index 0022eec63cda..0c9cb1820a52 100644
--- a/fs/hfsplus/extents.c
+++ b/fs/hfsplus/extents.c
@@ -85,35 +85,49 @@ static u32 hfsplus_ext_lastblock(struct hfsplus_extent *ext)
85 85
86static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd) 86static void __hfsplus_ext_write_extent(struct inode *inode, struct hfs_find_data *fd)
87{ 87{
88 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
88 int res; 89 int res;
89 90
90 hfsplus_ext_build_key(fd->search_key, inode->i_ino, HFSPLUS_I(inode).cached_start, 91 WARN_ON(!mutex_is_locked(&hip->extents_lock));
91 HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); 92
93 hfsplus_ext_build_key(fd->search_key, inode->i_ino, hip->cached_start,
94 HFSPLUS_IS_RSRC(inode) ?
95 HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA);
96
92 res = hfs_brec_find(fd); 97 res = hfs_brec_find(fd);
93 if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_NEW) { 98 if (hip->flags & HFSPLUS_FLG_EXT_NEW) {
94 if (res != -ENOENT) 99 if (res != -ENOENT)
95 return; 100 return;
96 hfs_brec_insert(fd, HFSPLUS_I(inode).cached_extents, sizeof(hfsplus_extent_rec)); 101 hfs_brec_insert(fd, hip->cached_extents,
97 HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); 102 sizeof(hfsplus_extent_rec));
103 hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
98 } else { 104 } else {
99 if (res) 105 if (res)
100 return; 106 return;
101 hfs_bnode_write(fd->bnode, HFSPLUS_I(inode).cached_extents, fd->entryoffset, fd->entrylength); 107 hfs_bnode_write(fd->bnode, hip->cached_extents,
102 HFSPLUS_I(inode).flags &= ~HFSPLUS_FLG_EXT_DIRTY; 108 fd->entryoffset, fd->entrylength);
109 hip->flags &= ~HFSPLUS_FLG_EXT_DIRTY;
103 } 110 }
104} 111}
105 112
106void hfsplus_ext_write_extent(struct inode *inode) 113static void hfsplus_ext_write_extent_locked(struct inode *inode)
107{ 114{
108 if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY) { 115 if (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_EXT_DIRTY) {
109 struct hfs_find_data fd; 116 struct hfs_find_data fd;
110 117
111 hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd); 118 hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
112 __hfsplus_ext_write_extent(inode, &fd); 119 __hfsplus_ext_write_extent(inode, &fd);
113 hfs_find_exit(&fd); 120 hfs_find_exit(&fd);
114 } 121 }
115} 122}
116 123
124void hfsplus_ext_write_extent(struct inode *inode)
125{
126 mutex_lock(&HFSPLUS_I(inode)->extents_lock);
127 hfsplus_ext_write_extent_locked(inode);
128 mutex_unlock(&HFSPLUS_I(inode)->extents_lock);
129}
130
117static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd, 131static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
118 struct hfsplus_extent *extent, 132 struct hfsplus_extent *extent,
119 u32 cnid, u32 block, u8 type) 133 u32 cnid, u32 block, u8 type)
@@ -136,33 +150,39 @@ static inline int __hfsplus_ext_read_extent(struct hfs_find_data *fd,
136 150
137static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block) 151static inline int __hfsplus_ext_cache_extent(struct hfs_find_data *fd, struct inode *inode, u32 block)
138{ 152{
153 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
139 int res; 154 int res;
140 155
141 if (HFSPLUS_I(inode).flags & HFSPLUS_FLG_EXT_DIRTY) 156 WARN_ON(!mutex_is_locked(&hip->extents_lock));
157
158 if (hip->flags & HFSPLUS_FLG_EXT_DIRTY)
142 __hfsplus_ext_write_extent(inode, fd); 159 __hfsplus_ext_write_extent(inode, fd);
143 160
144 res = __hfsplus_ext_read_extent(fd, HFSPLUS_I(inode).cached_extents, inode->i_ino, 161 res = __hfsplus_ext_read_extent(fd, hip->cached_extents, inode->i_ino,
145 block, HFSPLUS_IS_RSRC(inode) ? HFSPLUS_TYPE_RSRC : HFSPLUS_TYPE_DATA); 162 block, HFSPLUS_IS_RSRC(inode) ?
163 HFSPLUS_TYPE_RSRC :
164 HFSPLUS_TYPE_DATA);
146 if (!res) { 165 if (!res) {
147 HFSPLUS_I(inode).cached_start = be32_to_cpu(fd->key->ext.start_block); 166 hip->cached_start = be32_to_cpu(fd->key->ext.start_block);
148 HFSPLUS_I(inode).cached_blocks = hfsplus_ext_block_count(HFSPLUS_I(inode).cached_extents); 167 hip->cached_blocks = hfsplus_ext_block_count(hip->cached_extents);
149 } else { 168 } else {
150 HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0; 169 hip->cached_start = hip->cached_blocks = 0;
151 HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); 170 hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
152 } 171 }
153 return res; 172 return res;
154} 173}
155 174
156static int hfsplus_ext_read_extent(struct inode *inode, u32 block) 175static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
157{ 176{
177 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
158 struct hfs_find_data fd; 178 struct hfs_find_data fd;
159 int res; 179 int res;
160 180
161 if (block >= HFSPLUS_I(inode).cached_start && 181 if (block >= hip->cached_start &&
162 block < HFSPLUS_I(inode).cached_start + HFSPLUS_I(inode).cached_blocks) 182 block < hip->cached_start + hip->cached_blocks)
163 return 0; 183 return 0;
164 184
165 hfs_find_init(HFSPLUS_SB(inode->i_sb).ext_tree, &fd); 185 hfs_find_init(HFSPLUS_SB(inode->i_sb)->ext_tree, &fd);
166 res = __hfsplus_ext_cache_extent(&fd, inode, block); 186 res = __hfsplus_ext_cache_extent(&fd, inode, block);
167 hfs_find_exit(&fd); 187 hfs_find_exit(&fd);
168 return res; 188 return res;
@@ -172,21 +192,21 @@ static int hfsplus_ext_read_extent(struct inode *inode, u32 block)
172int hfsplus_get_block(struct inode *inode, sector_t iblock, 192int hfsplus_get_block(struct inode *inode, sector_t iblock,
173 struct buffer_head *bh_result, int create) 193 struct buffer_head *bh_result, int create)
174{ 194{
175 struct super_block *sb; 195 struct super_block *sb = inode->i_sb;
196 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
197 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
176 int res = -EIO; 198 int res = -EIO;
177 u32 ablock, dblock, mask; 199 u32 ablock, dblock, mask;
178 int shift; 200 int shift;
179 201
180 sb = inode->i_sb;
181
182 /* Convert inode block to disk allocation block */ 202 /* Convert inode block to disk allocation block */
183 shift = HFSPLUS_SB(sb).alloc_blksz_shift - sb->s_blocksize_bits; 203 shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
184 ablock = iblock >> HFSPLUS_SB(sb).fs_shift; 204 ablock = iblock >> sbi->fs_shift;
185 205
186 if (iblock >= HFSPLUS_I(inode).fs_blocks) { 206 if (iblock >= hip->fs_blocks) {
187 if (iblock > HFSPLUS_I(inode).fs_blocks || !create) 207 if (iblock > hip->fs_blocks || !create)
188 return -EIO; 208 return -EIO;
189 if (ablock >= HFSPLUS_I(inode).alloc_blocks) { 209 if (ablock >= hip->alloc_blocks) {
190 res = hfsplus_file_extend(inode); 210 res = hfsplus_file_extend(inode);
191 if (res) 211 if (res)
192 return res; 212 return res;
@@ -194,33 +214,33 @@ int hfsplus_get_block(struct inode *inode, sector_t iblock,
194 } else 214 } else
195 create = 0; 215 create = 0;
196 216
197 if (ablock < HFSPLUS_I(inode).first_blocks) { 217 if (ablock < hip->first_blocks) {
198 dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).first_extents, ablock); 218 dblock = hfsplus_ext_find_block(hip->first_extents, ablock);
199 goto done; 219 goto done;
200 } 220 }
201 221
202 if (inode->i_ino == HFSPLUS_EXT_CNID) 222 if (inode->i_ino == HFSPLUS_EXT_CNID)
203 return -EIO; 223 return -EIO;
204 224
205 mutex_lock(&HFSPLUS_I(inode).extents_lock); 225 mutex_lock(&hip->extents_lock);
206 res = hfsplus_ext_read_extent(inode, ablock); 226 res = hfsplus_ext_read_extent(inode, ablock);
207 if (!res) { 227 if (!res) {
208 dblock = hfsplus_ext_find_block(HFSPLUS_I(inode).cached_extents, ablock - 228 dblock = hfsplus_ext_find_block(hip->cached_extents,
209 HFSPLUS_I(inode).cached_start); 229 ablock - hip->cached_start);
210 } else { 230 } else {
211 mutex_unlock(&HFSPLUS_I(inode).extents_lock); 231 mutex_unlock(&hip->extents_lock);
212 return -EIO; 232 return -EIO;
213 } 233 }
214 mutex_unlock(&HFSPLUS_I(inode).extents_lock); 234 mutex_unlock(&hip->extents_lock);
215 235
216done: 236done:
217 dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock); 237 dprint(DBG_EXTENT, "get_block(%lu): %llu - %u\n", inode->i_ino, (long long)iblock, dblock);
218 mask = (1 << HFSPLUS_SB(sb).fs_shift) - 1; 238 mask = (1 << sbi->fs_shift) - 1;
219 map_bh(bh_result, sb, (dblock << HFSPLUS_SB(sb).fs_shift) + HFSPLUS_SB(sb).blockoffset + (iblock & mask)); 239 map_bh(bh_result, sb, (dblock << sbi->fs_shift) + sbi->blockoffset + (iblock & mask));
220 if (create) { 240 if (create) {
221 set_buffer_new(bh_result); 241 set_buffer_new(bh_result);
222 HFSPLUS_I(inode).phys_size += sb->s_blocksize; 242 hip->phys_size += sb->s_blocksize;
223 HFSPLUS_I(inode).fs_blocks++; 243 hip->fs_blocks++;
224 inode_add_bytes(inode, sb->s_blocksize); 244 inode_add_bytes(inode, sb->s_blocksize);
225 mark_inode_dirty(inode); 245 mark_inode_dirty(inode);
226 } 246 }
@@ -327,7 +347,7 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw
327 if (total_blocks == blocks) 347 if (total_blocks == blocks)
328 return 0; 348 return 0;
329 349
330 hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd); 350 hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
331 do { 351 do {
332 res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid, 352 res = __hfsplus_ext_read_extent(&fd, ext_entry, cnid,
333 total_blocks, type); 353 total_blocks, type);
@@ -348,29 +368,33 @@ int hfsplus_free_fork(struct super_block *sb, u32 cnid, struct hfsplus_fork_raw
348int hfsplus_file_extend(struct inode *inode) 368int hfsplus_file_extend(struct inode *inode)
349{ 369{
350 struct super_block *sb = inode->i_sb; 370 struct super_block *sb = inode->i_sb;
371 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
372 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
351 u32 start, len, goal; 373 u32 start, len, goal;
352 int res; 374 int res;
353 375
354 if (HFSPLUS_SB(sb).alloc_file->i_size * 8 < HFSPLUS_SB(sb).total_blocks - HFSPLUS_SB(sb).free_blocks + 8) { 376 if (sbi->alloc_file->i_size * 8 <
377 sbi->total_blocks - sbi->free_blocks + 8) {
355 // extend alloc file 378 // extend alloc file
356 printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n", HFSPLUS_SB(sb).alloc_file->i_size * 8, 379 printk(KERN_ERR "hfs: extend alloc file! (%Lu,%u,%u)\n",
357 HFSPLUS_SB(sb).total_blocks, HFSPLUS_SB(sb).free_blocks); 380 sbi->alloc_file->i_size * 8,
381 sbi->total_blocks, sbi->free_blocks);
358 return -ENOSPC; 382 return -ENOSPC;
359 } 383 }
360 384
361 mutex_lock(&HFSPLUS_I(inode).extents_lock); 385 mutex_lock(&hip->extents_lock);
362 if (HFSPLUS_I(inode).alloc_blocks == HFSPLUS_I(inode).first_blocks) 386 if (hip->alloc_blocks == hip->first_blocks)
363 goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).first_extents); 387 goal = hfsplus_ext_lastblock(hip->first_extents);
364 else { 388 else {
365 res = hfsplus_ext_read_extent(inode, HFSPLUS_I(inode).alloc_blocks); 389 res = hfsplus_ext_read_extent(inode, hip->alloc_blocks);
366 if (res) 390 if (res)
367 goto out; 391 goto out;
368 goal = hfsplus_ext_lastblock(HFSPLUS_I(inode).cached_extents); 392 goal = hfsplus_ext_lastblock(hip->cached_extents);
369 } 393 }
370 394
371 len = HFSPLUS_I(inode).clump_blocks; 395 len = hip->clump_blocks;
372 start = hfsplus_block_allocate(sb, HFSPLUS_SB(sb).total_blocks, goal, &len); 396 start = hfsplus_block_allocate(sb, sbi->total_blocks, goal, &len);
373 if (start >= HFSPLUS_SB(sb).total_blocks) { 397 if (start >= sbi->total_blocks) {
374 start = hfsplus_block_allocate(sb, goal, 0, &len); 398 start = hfsplus_block_allocate(sb, goal, 0, &len);
375 if (start >= goal) { 399 if (start >= goal) {
376 res = -ENOSPC; 400 res = -ENOSPC;
@@ -379,56 +403,56 @@ int hfsplus_file_extend(struct inode *inode)
379 } 403 }
380 404
381 dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len); 405 dprint(DBG_EXTENT, "extend %lu: %u,%u\n", inode->i_ino, start, len);
382 if (HFSPLUS_I(inode).alloc_blocks <= HFSPLUS_I(inode).first_blocks) { 406
383 if (!HFSPLUS_I(inode).first_blocks) { 407 if (hip->alloc_blocks <= hip->first_blocks) {
408 if (!hip->first_blocks) {
384 dprint(DBG_EXTENT, "first extents\n"); 409 dprint(DBG_EXTENT, "first extents\n");
385 /* no extents yet */ 410 /* no extents yet */
386 HFSPLUS_I(inode).first_extents[0].start_block = cpu_to_be32(start); 411 hip->first_extents[0].start_block = cpu_to_be32(start);
387 HFSPLUS_I(inode).first_extents[0].block_count = cpu_to_be32(len); 412 hip->first_extents[0].block_count = cpu_to_be32(len);
388 res = 0; 413 res = 0;
389 } else { 414 } else {
390 /* try to append to extents in inode */ 415 /* try to append to extents in inode */
391 res = hfsplus_add_extent(HFSPLUS_I(inode).first_extents, 416 res = hfsplus_add_extent(hip->first_extents,
392 HFSPLUS_I(inode).alloc_blocks, 417 hip->alloc_blocks,
393 start, len); 418 start, len);
394 if (res == -ENOSPC) 419 if (res == -ENOSPC)
395 goto insert_extent; 420 goto insert_extent;
396 } 421 }
397 if (!res) { 422 if (!res) {
398 hfsplus_dump_extent(HFSPLUS_I(inode).first_extents); 423 hfsplus_dump_extent(hip->first_extents);
399 HFSPLUS_I(inode).first_blocks += len; 424 hip->first_blocks += len;
400 } 425 }
401 } else { 426 } else {
402 res = hfsplus_add_extent(HFSPLUS_I(inode).cached_extents, 427 res = hfsplus_add_extent(hip->cached_extents,
403 HFSPLUS_I(inode).alloc_blocks - 428 hip->alloc_blocks - hip->cached_start,
404 HFSPLUS_I(inode).cached_start,
405 start, len); 429 start, len);
406 if (!res) { 430 if (!res) {
407 hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); 431 hfsplus_dump_extent(hip->cached_extents);
408 HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY; 432 hip->flags |= HFSPLUS_FLG_EXT_DIRTY;
409 HFSPLUS_I(inode).cached_blocks += len; 433 hip->cached_blocks += len;
410 } else if (res == -ENOSPC) 434 } else if (res == -ENOSPC)
411 goto insert_extent; 435 goto insert_extent;
412 } 436 }
413out: 437out:
414 mutex_unlock(&HFSPLUS_I(inode).extents_lock); 438 mutex_unlock(&hip->extents_lock);
415 if (!res) { 439 if (!res) {
416 HFSPLUS_I(inode).alloc_blocks += len; 440 hip->alloc_blocks += len;
417 mark_inode_dirty(inode); 441 mark_inode_dirty(inode);
418 } 442 }
419 return res; 443 return res;
420 444
421insert_extent: 445insert_extent:
422 dprint(DBG_EXTENT, "insert new extent\n"); 446 dprint(DBG_EXTENT, "insert new extent\n");
423 hfsplus_ext_write_extent(inode); 447 hfsplus_ext_write_extent_locked(inode);
424 448
425 memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); 449 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
426 HFSPLUS_I(inode).cached_extents[0].start_block = cpu_to_be32(start); 450 hip->cached_extents[0].start_block = cpu_to_be32(start);
427 HFSPLUS_I(inode).cached_extents[0].block_count = cpu_to_be32(len); 451 hip->cached_extents[0].block_count = cpu_to_be32(len);
428 hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); 452 hfsplus_dump_extent(hip->cached_extents);
429 HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW; 453 hip->flags |= HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW;
430 HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).alloc_blocks; 454 hip->cached_start = hip->alloc_blocks;
431 HFSPLUS_I(inode).cached_blocks = len; 455 hip->cached_blocks = len;
432 456
433 res = 0; 457 res = 0;
434 goto out; 458 goto out;
@@ -437,13 +461,15 @@ insert_extent:
437void hfsplus_file_truncate(struct inode *inode) 461void hfsplus_file_truncate(struct inode *inode)
438{ 462{
439 struct super_block *sb = inode->i_sb; 463 struct super_block *sb = inode->i_sb;
464 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
440 struct hfs_find_data fd; 465 struct hfs_find_data fd;
441 u32 alloc_cnt, blk_cnt, start; 466 u32 alloc_cnt, blk_cnt, start;
442 int res; 467 int res;
443 468
444 dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n", inode->i_ino, 469 dprint(DBG_INODE, "truncate: %lu, %Lu -> %Lu\n",
445 (long long)HFSPLUS_I(inode).phys_size, inode->i_size); 470 inode->i_ino, (long long)hip->phys_size, inode->i_size);
446 if (inode->i_size > HFSPLUS_I(inode).phys_size) { 471
472 if (inode->i_size > hip->phys_size) {
447 struct address_space *mapping = inode->i_mapping; 473 struct address_space *mapping = inode->i_mapping;
448 struct page *page; 474 struct page *page;
449 void *fsdata; 475 void *fsdata;
@@ -460,47 +486,48 @@ void hfsplus_file_truncate(struct inode *inode)
460 return; 486 return;
461 mark_inode_dirty(inode); 487 mark_inode_dirty(inode);
462 return; 488 return;
463 } else if (inode->i_size == HFSPLUS_I(inode).phys_size) 489 } else if (inode->i_size == hip->phys_size)
464 return; 490 return;
465 491
466 blk_cnt = (inode->i_size + HFSPLUS_SB(sb).alloc_blksz - 1) >> HFSPLUS_SB(sb).alloc_blksz_shift; 492 blk_cnt = (inode->i_size + HFSPLUS_SB(sb)->alloc_blksz - 1) >>
467 alloc_cnt = HFSPLUS_I(inode).alloc_blocks; 493 HFSPLUS_SB(sb)->alloc_blksz_shift;
494 alloc_cnt = hip->alloc_blocks;
468 if (blk_cnt == alloc_cnt) 495 if (blk_cnt == alloc_cnt)
469 goto out; 496 goto out;
470 497
471 mutex_lock(&HFSPLUS_I(inode).extents_lock); 498 mutex_lock(&hip->extents_lock);
472 hfs_find_init(HFSPLUS_SB(sb).ext_tree, &fd); 499 hfs_find_init(HFSPLUS_SB(sb)->ext_tree, &fd);
473 while (1) { 500 while (1) {
474 if (alloc_cnt == HFSPLUS_I(inode).first_blocks) { 501 if (alloc_cnt == hip->first_blocks) {
475 hfsplus_free_extents(sb, HFSPLUS_I(inode).first_extents, 502 hfsplus_free_extents(sb, hip->first_extents,
476 alloc_cnt, alloc_cnt - blk_cnt); 503 alloc_cnt, alloc_cnt - blk_cnt);
477 hfsplus_dump_extent(HFSPLUS_I(inode).first_extents); 504 hfsplus_dump_extent(hip->first_extents);
478 HFSPLUS_I(inode).first_blocks = blk_cnt; 505 hip->first_blocks = blk_cnt;
479 break; 506 break;
480 } 507 }
481 res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt); 508 res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt);
482 if (res) 509 if (res)
483 break; 510 break;
484 start = HFSPLUS_I(inode).cached_start; 511 start = hip->cached_start;
485 hfsplus_free_extents(sb, HFSPLUS_I(inode).cached_extents, 512 hfsplus_free_extents(sb, hip->cached_extents,
486 alloc_cnt - start, alloc_cnt - blk_cnt); 513 alloc_cnt - start, alloc_cnt - blk_cnt);
487 hfsplus_dump_extent(HFSPLUS_I(inode).cached_extents); 514 hfsplus_dump_extent(hip->cached_extents);
488 if (blk_cnt > start) { 515 if (blk_cnt > start) {
489 HFSPLUS_I(inode).flags |= HFSPLUS_FLG_EXT_DIRTY; 516 hip->flags |= HFSPLUS_FLG_EXT_DIRTY;
490 break; 517 break;
491 } 518 }
492 alloc_cnt = start; 519 alloc_cnt = start;
493 HFSPLUS_I(inode).cached_start = HFSPLUS_I(inode).cached_blocks = 0; 520 hip->cached_start = hip->cached_blocks = 0;
494 HFSPLUS_I(inode).flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW); 521 hip->flags &= ~(HFSPLUS_FLG_EXT_DIRTY | HFSPLUS_FLG_EXT_NEW);
495 hfs_brec_remove(&fd); 522 hfs_brec_remove(&fd);
496 } 523 }
497 hfs_find_exit(&fd); 524 hfs_find_exit(&fd);
498 mutex_unlock(&HFSPLUS_I(inode).extents_lock); 525 mutex_unlock(&hip->extents_lock);
499 526
500 HFSPLUS_I(inode).alloc_blocks = blk_cnt; 527 hip->alloc_blocks = blk_cnt;
501out: 528out:
502 HFSPLUS_I(inode).phys_size = inode->i_size; 529 hip->phys_size = inode->i_size;
503 HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; 530 hip->fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
504 inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits); 531 inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
505 mark_inode_dirty(inode); 532 mark_inode_dirty(inode);
506} 533}
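
Two conventions recur throughout the extents.c hunks above: HFSPLUS_I(inode) is loaded once into a local "hip" pointer instead of being re-evaluated on every access, and hfsplus_ext_write_extent() is split into a locking wrapper plus a _locked helper so that hfsplus_file_extend(), which already holds extents_lock, can flush the cached extent without deadlocking. A minimal userspace sketch of that wrapper/_locked split (pthread mutex in place of struct mutex; names and fields are illustrative only):

#include <pthread.h>

struct extent_cache {
	pthread_mutex_t extents_lock;   /* stands in for hip->extents_lock */
	int dirty;                      /* stands in for HFSPLUS_FLG_EXT_DIRTY */
	unsigned int cached_start, cached_blocks;
};

/*
 * Caller must already hold extents_lock; the kernel version documents
 * this with WARN_ON(!mutex_is_locked(&hip->extents_lock)).
 */
static void write_extent_locked(struct extent_cache *ec)
{
	if (ec->dirty) {
		/* ...flush the cached extent record to the extents tree... */
		ec->dirty = 0;
	}
}

/* Public entry point: takes the lock, then reuses the _locked helper. */
static void write_extent(struct extent_cache *ec)
{
	pthread_mutex_lock(&ec->extents_lock);
	write_extent_locked(ec);
	pthread_mutex_unlock(&ec->extents_lock);
}
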
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index dc856be3c2b0..cb3653efb57a 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -62,7 +62,7 @@ struct hfs_btree {
62 unsigned int depth; 62 unsigned int depth;
63 63
64 //unsigned int map1_size, map_size; 64 //unsigned int map1_size, map_size;
65 struct semaphore tree_lock; 65 struct mutex tree_lock;
66 66
67 unsigned int pages_per_bnode; 67 unsigned int pages_per_bnode;
68 spinlock_t hash_lock; 68 spinlock_t hash_lock;
@@ -121,16 +121,21 @@ struct hfsplus_sb_info {
121 u32 sect_count; 121 u32 sect_count;
122 int fs_shift; 122 int fs_shift;
123 123
124 /* Stuff in host order from Vol Header */ 124 /* immutable data from the volume header */
125 u32 alloc_blksz; 125 u32 alloc_blksz;
126 int alloc_blksz_shift; 126 int alloc_blksz_shift;
127 u32 total_blocks; 127 u32 total_blocks;
128 u32 data_clump_blocks, rsrc_clump_blocks;
129
130 /* mutable data from the volume header, protected by alloc_mutex */
128 u32 free_blocks; 131 u32 free_blocks;
129 u32 next_alloc; 132 struct mutex alloc_mutex;
133
134 /* mutable data from the volume header, protected by vh_mutex */
130 u32 next_cnid; 135 u32 next_cnid;
131 u32 file_count; 136 u32 file_count;
132 u32 folder_count; 137 u32 folder_count;
133 u32 data_clump_blocks, rsrc_clump_blocks; 138 struct mutex vh_mutex;
134 139
135 /* Config options */ 140 /* Config options */
136 u32 creator; 141 u32 creator;
@@ -143,40 +148,50 @@ struct hfsplus_sb_info {
143 int part, session; 148 int part, session;
144 149
145 unsigned long flags; 150 unsigned long flags;
146
147 struct hlist_head rsrc_inodes;
148}; 151};
149 152
150#define HFSPLUS_SB_WRITEBACKUP 0x0001 153#define HFSPLUS_SB_WRITEBACKUP 0
151#define HFSPLUS_SB_NODECOMPOSE 0x0002 154#define HFSPLUS_SB_NODECOMPOSE 1
152#define HFSPLUS_SB_FORCE 0x0004 155#define HFSPLUS_SB_FORCE 2
153#define HFSPLUS_SB_HFSX 0x0008 156#define HFSPLUS_SB_HFSX 3
154#define HFSPLUS_SB_CASEFOLD 0x0010 157#define HFSPLUS_SB_CASEFOLD 4
155 158
156 159
157struct hfsplus_inode_info { 160struct hfsplus_inode_info {
158 struct mutex extents_lock;
159 u32 clump_blocks, alloc_blocks;
160 sector_t fs_blocks;
161 /* Allocation extents from catalog record or volume header */
162 hfsplus_extent_rec first_extents;
163 u32 first_blocks;
164 hfsplus_extent_rec cached_extents;
165 u32 cached_start, cached_blocks;
166 atomic_t opencnt; 161 atomic_t opencnt;
167 162
168 struct inode *rsrc_inode; 163 /*
164 * Extent allocation information, protected by extents_lock.
165 */
166 u32 first_blocks;
167 u32 clump_blocks;
168 u32 alloc_blocks;
169 u32 cached_start;
170 u32 cached_blocks;
171 hfsplus_extent_rec first_extents;
172 hfsplus_extent_rec cached_extents;
169 unsigned long flags; 173 unsigned long flags;
174 struct mutex extents_lock;
170 175
176 /*
177 * Immutable data.
178 */
179 struct inode *rsrc_inode;
171 __be32 create_date; 180 __be32 create_date;
172 /* Device number in hfsplus_permissions in catalog */
173 u32 dev;
174 /* BSD system and user file flags */
175 u8 rootflags;
176 u8 userflags;
177 181
182 /*
183 * Protected by sbi->vh_mutex.
184 */
185 u32 linkid;
186
187 /*
188 * Protected by i_mutex.
189 */
190 sector_t fs_blocks;
191 u8 userflags; /* BSD user file flags */
178 struct list_head open_dir_list; 192 struct list_head open_dir_list;
179 loff_t phys_size; 193 loff_t phys_size;
194
180 struct inode vfs_inode; 195 struct inode vfs_inode;
181}; 196};
182 197
@@ -184,8 +199,8 @@ struct hfsplus_inode_info {
184#define HFSPLUS_FLG_EXT_DIRTY 0x0002 199#define HFSPLUS_FLG_EXT_DIRTY 0x0002
185#define HFSPLUS_FLG_EXT_NEW 0x0004 200#define HFSPLUS_FLG_EXT_NEW 0x0004
186 201
187#define HFSPLUS_IS_DATA(inode) (!(HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC)) 202#define HFSPLUS_IS_DATA(inode) (!(HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC))
188#define HFSPLUS_IS_RSRC(inode) (HFSPLUS_I(inode).flags & HFSPLUS_FLG_RSRC) 203#define HFSPLUS_IS_RSRC(inode) (HFSPLUS_I(inode)->flags & HFSPLUS_FLG_RSRC)
189 204
190struct hfs_find_data { 205struct hfs_find_data {
191 /* filled by caller */ 206 /* filled by caller */
@@ -311,6 +326,7 @@ int hfsplus_create_cat(u32, struct inode *, struct qstr *, struct inode *);
311int hfsplus_delete_cat(u32, struct inode *, struct qstr *); 326int hfsplus_delete_cat(u32, struct inode *, struct qstr *);
312int hfsplus_rename_cat(u32, struct inode *, struct qstr *, 327int hfsplus_rename_cat(u32, struct inode *, struct qstr *,
313 struct inode *, struct qstr *); 328 struct inode *, struct qstr *);
329void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms);
314 330
315/* dir.c */ 331/* dir.c */
316extern const struct inode_operations hfsplus_dir_inode_operations; 332extern const struct inode_operations hfsplus_dir_inode_operations;
@@ -372,26 +388,15 @@ int hfsplus_read_wrapper(struct super_block *);
372int hfs_part_find(struct super_block *, sector_t *, sector_t *); 388int hfs_part_find(struct super_block *, sector_t *, sector_t *);
373 389
374/* access macros */ 390/* access macros */
375/*
376static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb) 391static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
377{ 392{
378 return sb->s_fs_info; 393 return sb->s_fs_info;
379} 394}
395
380static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode) 396static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
381{ 397{
382 return list_entry(inode, struct hfsplus_inode_info, vfs_inode); 398 return list_entry(inode, struct hfsplus_inode_info, vfs_inode);
383} 399}
384*/
385#define HFSPLUS_SB(super) (*(struct hfsplus_sb_info *)(super)->s_fs_info)
386#define HFSPLUS_I(inode) (*list_entry(inode, struct hfsplus_inode_info, vfs_inode))
387
388#if 1
389#define hfsplus_kmap(p) ({ struct page *__p = (p); kmap(__p); })
390#define hfsplus_kunmap(p) ({ struct page *__p = (p); kunmap(__p); __p; })
391#else
392#define hfsplus_kmap(p) kmap(p)
393#define hfsplus_kunmap(p) kunmap(p)
394#endif
395 400
396#define sb_bread512(sb, sec, data) ({ \ 401#define sb_bread512(sb, sec, data) ({ \
397 struct buffer_head *__bh; \ 402 struct buffer_head *__bh; \
@@ -419,6 +424,4 @@ static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
419#define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec) 424#define hfsp_ut2mt(t) __hfsp_ut2mt((t).tv_sec)
420#define hfsp_now2mt() __hfsp_ut2mt(get_seconds()) 425#define hfsp_now2mt() __hfsp_ut2mt(get_seconds())
421 426
422#define kdev_t_to_nr(x) (x)
423
424#endif 427#endif
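
The header change above is what drives the "." to "->" conversions in every other file of this series: HFSPLUS_SB() and HFSPLUS_I() used to be macros that expanded to a dereferenced structure, and are now static inline accessors returning pointers. A self-contained approximation of the new accessor shape (stubbed VFS types so the fragment compiles on its own; the real definitions live in the kernel headers):

#include <stddef.h>

struct super_block { void *s_fs_info; };      /* stub, not the real VFS struct */
struct inode { unsigned long i_ino; };        /* stub, not the real VFS struct */

struct hfsplus_sb_info { unsigned int total_blocks, free_blocks; };

struct hfsplus_inode_info {
	unsigned long flags;
	struct inode vfs_inode;               /* VFS inode embedded in the private one */
};

static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
{
	return sb->s_fs_info;
}

static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode)
{
	/* container_of()/list_entry() in the kernel, open-coded via offsetof() here */
	return (struct hfsplus_inode_info *)
		((char *)inode - offsetof(struct hfsplus_inode_info, vfs_inode));
}

Callers that previously wrote HFSPLUS_SB(sb).free_blocks now write HFSPLUS_SB(sb)->free_blocks, which is the mechanical change visible in extents.c, inode.c, ioctl.c, options.c and part_tbl.c.
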
diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
index fe99fe8db61a..6892899fd6fb 100644
--- a/fs/hfsplus/hfsplus_raw.h
+++ b/fs/hfsplus/hfsplus_raw.h
@@ -200,6 +200,7 @@ struct hfsplus_cat_key {
200 struct hfsplus_unistr name; 200 struct hfsplus_unistr name;
201} __packed; 201} __packed;
202 202
203#define HFSPLUS_CAT_KEYLEN (sizeof(struct hfsplus_cat_key))
203 204
204/* Structs from hfs.h */ 205/* Structs from hfs.h */
205struct hfsp_point { 206struct hfsp_point {
@@ -323,7 +324,7 @@ struct hfsplus_ext_key {
323 __be32 start_block; 324 __be32 start_block;
324} __packed; 325} __packed;
325 326
326#define HFSPLUS_EXT_KEYLEN 12 327#define HFSPLUS_EXT_KEYLEN sizeof(struct hfsplus_ext_key)
327 328
328/* HFS+ generic BTree key */ 329/* HFS+ generic BTree key */
329typedef union { 330typedef union {
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index c5a979d62c65..78449280dae0 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -36,7 +36,7 @@ static int hfsplus_write_begin(struct file *file, struct address_space *mapping,
36 *pagep = NULL; 36 *pagep = NULL;
37 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 37 ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
38 hfsplus_get_block, 38 hfsplus_get_block,
39 &HFSPLUS_I(mapping->host).phys_size); 39 &HFSPLUS_I(mapping->host)->phys_size);
40 if (unlikely(ret)) { 40 if (unlikely(ret)) {
41 loff_t isize = mapping->host->i_size; 41 loff_t isize = mapping->host->i_size;
42 if (pos + len > isize) 42 if (pos + len > isize)
@@ -62,13 +62,13 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask)
62 62
63 switch (inode->i_ino) { 63 switch (inode->i_ino) {
64 case HFSPLUS_EXT_CNID: 64 case HFSPLUS_EXT_CNID:
65 tree = HFSPLUS_SB(sb).ext_tree; 65 tree = HFSPLUS_SB(sb)->ext_tree;
66 break; 66 break;
67 case HFSPLUS_CAT_CNID: 67 case HFSPLUS_CAT_CNID:
68 tree = HFSPLUS_SB(sb).cat_tree; 68 tree = HFSPLUS_SB(sb)->cat_tree;
69 break; 69 break;
70 case HFSPLUS_ATTR_CNID: 70 case HFSPLUS_ATTR_CNID:
71 tree = HFSPLUS_SB(sb).attr_tree; 71 tree = HFSPLUS_SB(sb)->attr_tree;
72 break; 72 break;
73 default: 73 default:
74 BUG(); 74 BUG();
@@ -172,12 +172,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
172 struct hfs_find_data fd; 172 struct hfs_find_data fd;
173 struct super_block *sb = dir->i_sb; 173 struct super_block *sb = dir->i_sb;
174 struct inode *inode = NULL; 174 struct inode *inode = NULL;
175 struct hfsplus_inode_info *hip;
175 int err; 176 int err;
176 177
177 if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc")) 178 if (HFSPLUS_IS_RSRC(dir) || strcmp(dentry->d_name.name, "rsrc"))
178 goto out; 179 goto out;
179 180
180 inode = HFSPLUS_I(dir).rsrc_inode; 181 inode = HFSPLUS_I(dir)->rsrc_inode;
181 if (inode) 182 if (inode)
182 goto out; 183 goto out;
183 184
@@ -185,12 +186,13 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
185 if (!inode) 186 if (!inode)
186 return ERR_PTR(-ENOMEM); 187 return ERR_PTR(-ENOMEM);
187 188
189 hip = HFSPLUS_I(inode);
188 inode->i_ino = dir->i_ino; 190 inode->i_ino = dir->i_ino;
189 INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); 191 INIT_LIST_HEAD(&hip->open_dir_list);
190 mutex_init(&HFSPLUS_I(inode).extents_lock); 192 mutex_init(&hip->extents_lock);
191 HFSPLUS_I(inode).flags = HFSPLUS_FLG_RSRC; 193 hip->flags = HFSPLUS_FLG_RSRC;
192 194
193 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 195 hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd);
194 err = hfsplus_find_cat(sb, dir->i_ino, &fd); 196 err = hfsplus_find_cat(sb, dir->i_ino, &fd);
195 if (!err) 197 if (!err)
196 err = hfsplus_cat_read_inode(inode, &fd); 198 err = hfsplus_cat_read_inode(inode, &fd);
@@ -199,10 +201,18 @@ static struct dentry *hfsplus_file_lookup(struct inode *dir, struct dentry *dent
199 iput(inode); 201 iput(inode);
200 return ERR_PTR(err); 202 return ERR_PTR(err);
201 } 203 }
202 HFSPLUS_I(inode).rsrc_inode = dir; 204 hip->rsrc_inode = dir;
203 HFSPLUS_I(dir).rsrc_inode = inode; 205 HFSPLUS_I(dir)->rsrc_inode = inode;
204 igrab(dir); 206 igrab(dir);
205 hlist_add_head(&inode->i_hash, &HFSPLUS_SB(sb).rsrc_inodes); 207
208 /*
209 * __mark_inode_dirty expects inodes to be hashed. Since we don't
210 * want resource fork inodes in the regular inode space, we make them
211 * appear hashed, but do not put on any lists. hlist_del()
212 * will work fine and require no locking.
213 */
214 inode->i_hash.pprev = &inode->i_hash.next;
215
206 mark_inode_dirty(inode); 216 mark_inode_dirty(inode);
207out: 217out:
208 d_add(dentry, inode); 218 d_add(dentry, inode);
@@ -211,30 +221,27 @@ out:
211 221
212static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir) 222static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, int dir)
213{ 223{
214 struct super_block *sb = inode->i_sb; 224 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
215 u16 mode; 225 u16 mode;
216 226
217 mode = be16_to_cpu(perms->mode); 227 mode = be16_to_cpu(perms->mode);
218 228
219 inode->i_uid = be32_to_cpu(perms->owner); 229 inode->i_uid = be32_to_cpu(perms->owner);
220 if (!inode->i_uid && !mode) 230 if (!inode->i_uid && !mode)
221 inode->i_uid = HFSPLUS_SB(sb).uid; 231 inode->i_uid = sbi->uid;
222 232
223 inode->i_gid = be32_to_cpu(perms->group); 233 inode->i_gid = be32_to_cpu(perms->group);
224 if (!inode->i_gid && !mode) 234 if (!inode->i_gid && !mode)
225 inode->i_gid = HFSPLUS_SB(sb).gid; 235 inode->i_gid = sbi->gid;
226 236
227 if (dir) { 237 if (dir) {
228 mode = mode ? (mode & S_IALLUGO) : 238 mode = mode ? (mode & S_IALLUGO) : (S_IRWXUGO & ~(sbi->umask));
229 (S_IRWXUGO & ~(HFSPLUS_SB(sb).umask));
230 mode |= S_IFDIR; 239 mode |= S_IFDIR;
231 } else if (!mode) 240 } else if (!mode)
232 mode = S_IFREG | ((S_IRUGO|S_IWUGO) & 241 mode = S_IFREG | ((S_IRUGO|S_IWUGO) & ~(sbi->umask));
233 ~(HFSPLUS_SB(sb).umask));
234 inode->i_mode = mode; 242 inode->i_mode = mode;
235 243
236 HFSPLUS_I(inode).rootflags = perms->rootflags; 244 HFSPLUS_I(inode)->userflags = perms->userflags;
237 HFSPLUS_I(inode).userflags = perms->userflags;
238 if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE) 245 if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE)
239 inode->i_flags |= S_IMMUTABLE; 246 inode->i_flags |= S_IMMUTABLE;
240 else 247 else
@@ -245,30 +252,13 @@ static void hfsplus_get_perms(struct inode *inode, struct hfsplus_perm *perms, i
245 inode->i_flags &= ~S_APPEND; 252 inode->i_flags &= ~S_APPEND;
246} 253}
247 254
248static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm *perms)
249{
250 if (inode->i_flags & S_IMMUTABLE)
251 perms->rootflags |= HFSPLUS_FLG_IMMUTABLE;
252 else
253 perms->rootflags &= ~HFSPLUS_FLG_IMMUTABLE;
254 if (inode->i_flags & S_APPEND)
255 perms->rootflags |= HFSPLUS_FLG_APPEND;
256 else
257 perms->rootflags &= ~HFSPLUS_FLG_APPEND;
258 perms->userflags = HFSPLUS_I(inode).userflags;
259 perms->mode = cpu_to_be16(inode->i_mode);
260 perms->owner = cpu_to_be32(inode->i_uid);
261 perms->group = cpu_to_be32(inode->i_gid);
262 perms->dev = cpu_to_be32(HFSPLUS_I(inode).dev);
263}
264
265static int hfsplus_file_open(struct inode *inode, struct file *file) 255static int hfsplus_file_open(struct inode *inode, struct file *file)
266{ 256{
267 if (HFSPLUS_IS_RSRC(inode)) 257 if (HFSPLUS_IS_RSRC(inode))
268 inode = HFSPLUS_I(inode).rsrc_inode; 258 inode = HFSPLUS_I(inode)->rsrc_inode;
269 if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) 259 if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
270 return -EOVERFLOW; 260 return -EOVERFLOW;
271 atomic_inc(&HFSPLUS_I(inode).opencnt); 261 atomic_inc(&HFSPLUS_I(inode)->opencnt);
272 return 0; 262 return 0;
273} 263}
274 264
@@ -277,12 +267,13 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
277 struct super_block *sb = inode->i_sb; 267 struct super_block *sb = inode->i_sb;
278 268
279 if (HFSPLUS_IS_RSRC(inode)) 269 if (HFSPLUS_IS_RSRC(inode))
280 inode = HFSPLUS_I(inode).rsrc_inode; 270 inode = HFSPLUS_I(inode)->rsrc_inode;
281 if (atomic_dec_and_test(&HFSPLUS_I(inode).opencnt)) { 271 if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) {
282 mutex_lock(&inode->i_mutex); 272 mutex_lock(&inode->i_mutex);
283 hfsplus_file_truncate(inode); 273 hfsplus_file_truncate(inode);
284 if (inode->i_flags & S_DEAD) { 274 if (inode->i_flags & S_DEAD) {
285 hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL); 275 hfsplus_delete_cat(inode->i_ino,
276 HFSPLUS_SB(sb)->hidden_dir, NULL);
286 hfsplus_delete_inode(inode); 277 hfsplus_delete_inode(inode);
287 } 278 }
288 mutex_unlock(&inode->i_mutex); 279 mutex_unlock(&inode->i_mutex);
@@ -361,47 +352,52 @@ static const struct file_operations hfsplus_file_operations = {
361 352
362struct inode *hfsplus_new_inode(struct super_block *sb, int mode) 353struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
363{ 354{
355 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
364 struct inode *inode = new_inode(sb); 356 struct inode *inode = new_inode(sb);
357 struct hfsplus_inode_info *hip;
358
365 if (!inode) 359 if (!inode)
366 return NULL; 360 return NULL;
367 361
368 inode->i_ino = HFSPLUS_SB(sb).next_cnid++; 362 inode->i_ino = sbi->next_cnid++;
369 inode->i_mode = mode; 363 inode->i_mode = mode;
370 inode->i_uid = current_fsuid(); 364 inode->i_uid = current_fsuid();
371 inode->i_gid = current_fsgid(); 365 inode->i_gid = current_fsgid();
372 inode->i_nlink = 1; 366 inode->i_nlink = 1;
373 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; 367 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
374 INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); 368
375 mutex_init(&HFSPLUS_I(inode).extents_lock); 369 hip = HFSPLUS_I(inode);
376 atomic_set(&HFSPLUS_I(inode).opencnt, 0); 370 INIT_LIST_HEAD(&hip->open_dir_list);
377 HFSPLUS_I(inode).flags = 0; 371 mutex_init(&hip->extents_lock);
378 memset(HFSPLUS_I(inode).first_extents, 0, sizeof(hfsplus_extent_rec)); 372 atomic_set(&hip->opencnt, 0);
379 memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); 373 hip->flags = 0;
380 HFSPLUS_I(inode).alloc_blocks = 0; 374 memset(hip->first_extents, 0, sizeof(hfsplus_extent_rec));
381 HFSPLUS_I(inode).first_blocks = 0; 375 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
382 HFSPLUS_I(inode).cached_start = 0; 376 hip->alloc_blocks = 0;
383 HFSPLUS_I(inode).cached_blocks = 0; 377 hip->first_blocks = 0;
384 HFSPLUS_I(inode).phys_size = 0; 378 hip->cached_start = 0;
385 HFSPLUS_I(inode).fs_blocks = 0; 379 hip->cached_blocks = 0;
386 HFSPLUS_I(inode).rsrc_inode = NULL; 380 hip->phys_size = 0;
381 hip->fs_blocks = 0;
382 hip->rsrc_inode = NULL;
387 if (S_ISDIR(inode->i_mode)) { 383 if (S_ISDIR(inode->i_mode)) {
388 inode->i_size = 2; 384 inode->i_size = 2;
389 HFSPLUS_SB(sb).folder_count++; 385 sbi->folder_count++;
390 inode->i_op = &hfsplus_dir_inode_operations; 386 inode->i_op = &hfsplus_dir_inode_operations;
391 inode->i_fop = &hfsplus_dir_operations; 387 inode->i_fop = &hfsplus_dir_operations;
392 } else if (S_ISREG(inode->i_mode)) { 388 } else if (S_ISREG(inode->i_mode)) {
393 HFSPLUS_SB(sb).file_count++; 389 sbi->file_count++;
394 inode->i_op = &hfsplus_file_inode_operations; 390 inode->i_op = &hfsplus_file_inode_operations;
395 inode->i_fop = &hfsplus_file_operations; 391 inode->i_fop = &hfsplus_file_operations;
396 inode->i_mapping->a_ops = &hfsplus_aops; 392 inode->i_mapping->a_ops = &hfsplus_aops;
397 HFSPLUS_I(inode).clump_blocks = HFSPLUS_SB(sb).data_clump_blocks; 393 hip->clump_blocks = sbi->data_clump_blocks;
398 } else if (S_ISLNK(inode->i_mode)) { 394 } else if (S_ISLNK(inode->i_mode)) {
399 HFSPLUS_SB(sb).file_count++; 395 sbi->file_count++;
400 inode->i_op = &page_symlink_inode_operations; 396 inode->i_op = &page_symlink_inode_operations;
401 inode->i_mapping->a_ops = &hfsplus_aops; 397 inode->i_mapping->a_ops = &hfsplus_aops;
402 HFSPLUS_I(inode).clump_blocks = 1; 398 hip->clump_blocks = 1;
403 } else 399 } else
404 HFSPLUS_SB(sb).file_count++; 400 sbi->file_count++;
405 insert_inode_hash(inode); 401 insert_inode_hash(inode);
406 mark_inode_dirty(inode); 402 mark_inode_dirty(inode);
407 sb->s_dirt = 1; 403 sb->s_dirt = 1;
@@ -414,11 +410,11 @@ void hfsplus_delete_inode(struct inode *inode)
414 struct super_block *sb = inode->i_sb; 410 struct super_block *sb = inode->i_sb;
415 411
416 if (S_ISDIR(inode->i_mode)) { 412 if (S_ISDIR(inode->i_mode)) {
417 HFSPLUS_SB(sb).folder_count--; 413 HFSPLUS_SB(sb)->folder_count--;
418 sb->s_dirt = 1; 414 sb->s_dirt = 1;
419 return; 415 return;
420 } 416 }
421 HFSPLUS_SB(sb).file_count--; 417 HFSPLUS_SB(sb)->file_count--;
422 if (S_ISREG(inode->i_mode)) { 418 if (S_ISREG(inode->i_mode)) {
423 if (!inode->i_nlink) { 419 if (!inode->i_nlink) {
424 inode->i_size = 0; 420 inode->i_size = 0;
@@ -434,34 +430,39 @@ void hfsplus_delete_inode(struct inode *inode)
434void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork) 430void hfsplus_inode_read_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
435{ 431{
436 struct super_block *sb = inode->i_sb; 432 struct super_block *sb = inode->i_sb;
433 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
434 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
437 u32 count; 435 u32 count;
438 int i; 436 int i;
439 437
440 memcpy(&HFSPLUS_I(inode).first_extents, &fork->extents, 438 memcpy(&hip->first_extents, &fork->extents, sizeof(hfsplus_extent_rec));
441 sizeof(hfsplus_extent_rec));
442 for (count = 0, i = 0; i < 8; i++) 439 for (count = 0, i = 0; i < 8; i++)
443 count += be32_to_cpu(fork->extents[i].block_count); 440 count += be32_to_cpu(fork->extents[i].block_count);
444 HFSPLUS_I(inode).first_blocks = count; 441 hip->first_blocks = count;
445 memset(HFSPLUS_I(inode).cached_extents, 0, sizeof(hfsplus_extent_rec)); 442 memset(hip->cached_extents, 0, sizeof(hfsplus_extent_rec));
446 HFSPLUS_I(inode).cached_start = 0; 443 hip->cached_start = 0;
447 HFSPLUS_I(inode).cached_blocks = 0; 444 hip->cached_blocks = 0;
448 445
449 HFSPLUS_I(inode).alloc_blocks = be32_to_cpu(fork->total_blocks); 446 hip->alloc_blocks = be32_to_cpu(fork->total_blocks);
450 inode->i_size = HFSPLUS_I(inode).phys_size = be64_to_cpu(fork->total_size); 447 hip->phys_size = inode->i_size = be64_to_cpu(fork->total_size);
451 HFSPLUS_I(inode).fs_blocks = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; 448 hip->fs_blocks =
452 inode_set_bytes(inode, HFSPLUS_I(inode).fs_blocks << sb->s_blocksize_bits); 449 (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
453 HFSPLUS_I(inode).clump_blocks = be32_to_cpu(fork->clump_size) >> HFSPLUS_SB(sb).alloc_blksz_shift; 450 inode_set_bytes(inode, hip->fs_blocks << sb->s_blocksize_bits);
454 if (!HFSPLUS_I(inode).clump_blocks) 451 hip->clump_blocks =
455 HFSPLUS_I(inode).clump_blocks = HFSPLUS_IS_RSRC(inode) ? HFSPLUS_SB(sb).rsrc_clump_blocks : 452 be32_to_cpu(fork->clump_size) >> sbi->alloc_blksz_shift;
456 HFSPLUS_SB(sb).data_clump_blocks; 453 if (!hip->clump_blocks) {
454 hip->clump_blocks = HFSPLUS_IS_RSRC(inode) ?
455 sbi->rsrc_clump_blocks :
456 sbi->data_clump_blocks;
457 }
457} 458}
458 459
459void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork) 460void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork)
460{ 461{
461 memcpy(&fork->extents, &HFSPLUS_I(inode).first_extents, 462 memcpy(&fork->extents, &HFSPLUS_I(inode)->first_extents,
462 sizeof(hfsplus_extent_rec)); 463 sizeof(hfsplus_extent_rec));
463 fork->total_size = cpu_to_be64(inode->i_size); 464 fork->total_size = cpu_to_be64(inode->i_size);
464 fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode).alloc_blocks); 465 fork->total_blocks = cpu_to_be32(HFSPLUS_I(inode)->alloc_blocks);
465} 466}
466 467
467int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) 468int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
@@ -472,7 +473,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
472 473
473 type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset); 474 type = hfs_bnode_read_u16(fd->bnode, fd->entryoffset);
474 475
475 HFSPLUS_I(inode).dev = 0; 476 HFSPLUS_I(inode)->linkid = 0;
476 if (type == HFSPLUS_FOLDER) { 477 if (type == HFSPLUS_FOLDER) {
477 struct hfsplus_cat_folder *folder = &entry.folder; 478 struct hfsplus_cat_folder *folder = &entry.folder;
478 479
@@ -486,8 +487,8 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
486 inode->i_atime = hfsp_mt2ut(folder->access_date); 487 inode->i_atime = hfsp_mt2ut(folder->access_date);
487 inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); 488 inode->i_mtime = hfsp_mt2ut(folder->content_mod_date);
488 inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date); 489 inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date);
489 HFSPLUS_I(inode).create_date = folder->create_date; 490 HFSPLUS_I(inode)->create_date = folder->create_date;
490 HFSPLUS_I(inode).fs_blocks = 0; 491 HFSPLUS_I(inode)->fs_blocks = 0;
491 inode->i_op = &hfsplus_dir_inode_operations; 492 inode->i_op = &hfsplus_dir_inode_operations;
492 inode->i_fop = &hfsplus_dir_operations; 493 inode->i_fop = &hfsplus_dir_operations;
493 } else if (type == HFSPLUS_FILE) { 494 } else if (type == HFSPLUS_FILE) {
@@ -518,7 +519,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
518 inode->i_atime = hfsp_mt2ut(file->access_date); 519 inode->i_atime = hfsp_mt2ut(file->access_date);
519 inode->i_mtime = hfsp_mt2ut(file->content_mod_date); 520 inode->i_mtime = hfsp_mt2ut(file->content_mod_date);
520 inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date); 521 inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date);
521 HFSPLUS_I(inode).create_date = file->create_date; 522 HFSPLUS_I(inode)->create_date = file->create_date;
522 } else { 523 } else {
523 printk(KERN_ERR "hfs: bad catalog entry used to create inode\n"); 524 printk(KERN_ERR "hfs: bad catalog entry used to create inode\n");
524 res = -EIO; 525 res = -EIO;
@@ -533,12 +534,12 @@ int hfsplus_cat_write_inode(struct inode *inode)
533 hfsplus_cat_entry entry; 534 hfsplus_cat_entry entry;
534 535
535 if (HFSPLUS_IS_RSRC(inode)) 536 if (HFSPLUS_IS_RSRC(inode))
536 main_inode = HFSPLUS_I(inode).rsrc_inode; 537 main_inode = HFSPLUS_I(inode)->rsrc_inode;
537 538
538 if (!main_inode->i_nlink) 539 if (!main_inode->i_nlink)
539 return 0; 540 return 0;
540 541
541 if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb).cat_tree, &fd)) 542 if (hfs_find_init(HFSPLUS_SB(main_inode->i_sb)->cat_tree, &fd))
542 /* panic? */ 543 /* panic? */
543 return -EIO; 544 return -EIO;
544 545
@@ -554,7 +555,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
554 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, 555 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
555 sizeof(struct hfsplus_cat_folder)); 556 sizeof(struct hfsplus_cat_folder));
556 /* simple node checks? */ 557 /* simple node checks? */
557 hfsplus_set_perms(inode, &folder->permissions); 558 hfsplus_cat_set_perms(inode, &folder->permissions);
558 folder->access_date = hfsp_ut2mt(inode->i_atime); 559 folder->access_date = hfsp_ut2mt(inode->i_atime);
559 folder->content_mod_date = hfsp_ut2mt(inode->i_mtime); 560 folder->content_mod_date = hfsp_ut2mt(inode->i_mtime);
560 folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); 561 folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime);
@@ -576,11 +577,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
576 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, 577 hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
577 sizeof(struct hfsplus_cat_file)); 578 sizeof(struct hfsplus_cat_file));
578 hfsplus_inode_write_fork(inode, &file->data_fork); 579 hfsplus_inode_write_fork(inode, &file->data_fork);
579 if (S_ISREG(inode->i_mode)) 580 hfsplus_cat_set_perms(inode, &file->permissions);
580 HFSPLUS_I(inode).dev = inode->i_nlink;
581 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
582 HFSPLUS_I(inode).dev = kdev_t_to_nr(inode->i_rdev);
583 hfsplus_set_perms(inode, &file->permissions);
584 if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE) 581 if ((file->permissions.rootflags | file->permissions.userflags) & HFSPLUS_FLG_IMMUTABLE)
585 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); 582 file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED);
586 else 583 else
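
The comment added to hfsplus_file_lookup() above is the subtle part of this hunk: __mark_inode_dirty() expects inodes to be hashed, so the resource-fork inode is made to look hashed by pointing i_hash.pprev at its own next pointer while never being placed on any hash chain. A small standalone model of that pointer trick, using a cut-down hlist (illustrative only; the real helpers are in <linux/list.h>):

#include <stdio.h>

struct hlist_node { struct hlist_node *next, **pprev; };

static int hlist_unhashed(const struct hlist_node *h) { return !h->pprev; }

static void hlist_del(struct hlist_node *n)
{
	*n->pprev = n->next;
	if (n->next)
		n->next->pprev = n->pprev;
}

int main(void)
{
	struct hlist_node fake = { .next = NULL, .pprev = NULL };

	/* the hfsplus_file_lookup() trick: look hashed, sit on no list */
	fake.pprev = &fake.next;

	printf("unhashed? %d\n", hlist_unhashed(&fake));  /* prints 0: looks hashed */
	hlist_del(&fake);  /* only rewrites fake.next; no shared list, so no locking */
	return 0;
}
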
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index ac405f099026..5b4667e08ef7 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -17,83 +17,98 @@
17#include <linux/mount.h> 17#include <linux/mount.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/xattr.h> 19#include <linux/xattr.h>
20#include <linux/smp_lock.h>
21#include <asm/uaccess.h> 20#include <asm/uaccess.h>
22#include "hfsplus_fs.h" 21#include "hfsplus_fs.h"
23 22
24long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) 23static int hfsplus_ioctl_getflags(struct file *file, int __user *user_flags)
25{ 24{
26 struct inode *inode = filp->f_path.dentry->d_inode; 25 struct inode *inode = file->f_path.dentry->d_inode;
26 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
27 unsigned int flags = 0;
28
29 if (inode->i_flags & S_IMMUTABLE)
30 flags |= FS_IMMUTABLE_FL;
 	 	 31 	if (inode->i_flags & S_APPEND)
32 flags |= FS_APPEND_FL;
33 if (hip->userflags & HFSPLUS_FLG_NODUMP)
34 flags |= FS_NODUMP_FL;
35
36 return put_user(flags, user_flags);
37}
38
39static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
40{
41 struct inode *inode = file->f_path.dentry->d_inode;
42 struct hfsplus_inode_info *hip = HFSPLUS_I(inode);
27 unsigned int flags; 43 unsigned int flags;
44 int err = 0;
28 45
29 lock_kernel(); 46 err = mnt_want_write(file->f_path.mnt);
30 switch (cmd) { 47 if (err)
31 case HFSPLUS_IOC_EXT2_GETFLAGS: 48 goto out;
32 flags = 0;
33 if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_IMMUTABLE)
34 flags |= FS_IMMUTABLE_FL; /* EXT2_IMMUTABLE_FL */
35 if (HFSPLUS_I(inode).rootflags & HFSPLUS_FLG_APPEND)
36 flags |= FS_APPEND_FL; /* EXT2_APPEND_FL */
37 if (HFSPLUS_I(inode).userflags & HFSPLUS_FLG_NODUMP)
38 flags |= FS_NODUMP_FL; /* EXT2_NODUMP_FL */
39 return put_user(flags, (int __user *)arg);
40 case HFSPLUS_IOC_EXT2_SETFLAGS: {
41 int err = 0;
42 err = mnt_want_write(filp->f_path.mnt);
43 if (err) {
44 unlock_kernel();
45 return err;
46 }
47 49
48 if (!is_owner_or_cap(inode)) { 50 if (!is_owner_or_cap(inode)) {
49 err = -EACCES; 51 err = -EACCES;
50 goto setflags_out; 52 goto out_drop_write;
51 } 53 }
52 if (get_user(flags, (int __user *)arg)) {
53 err = -EFAULT;
54 goto setflags_out;
55 }
56 if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) ||
57 HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) {
58 if (!capable(CAP_LINUX_IMMUTABLE)) {
59 err = -EPERM;
60 goto setflags_out;
61 }
62 }
63 54
64 /* don't silently ignore unsupported ext2 flags */ 55 if (get_user(flags, user_flags)) {
65 if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) { 56 err = -EFAULT;
66 err = -EOPNOTSUPP; 57 goto out_drop_write;
67 goto setflags_out; 58 }
68 } 59
69 if (flags & FS_IMMUTABLE_FL) { /* EXT2_IMMUTABLE_FL */ 60 mutex_lock(&inode->i_mutex);
70 inode->i_flags |= S_IMMUTABLE; 61
71 HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE; 62 if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) ||
72 } else { 63 inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
73 inode->i_flags &= ~S_IMMUTABLE; 64 if (!capable(CAP_LINUX_IMMUTABLE)) {
74 HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_IMMUTABLE; 65 err = -EPERM;
75 } 66 goto out_unlock_inode;
76 if (flags & FS_APPEND_FL) { /* EXT2_APPEND_FL */
77 inode->i_flags |= S_APPEND;
78 HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_APPEND;
79 } else {
80 inode->i_flags &= ~S_APPEND;
81 HFSPLUS_I(inode).rootflags &= ~HFSPLUS_FLG_APPEND;
82 } 67 }
83 if (flags & FS_NODUMP_FL) /* EXT2_NODUMP_FL */
84 HFSPLUS_I(inode).userflags |= HFSPLUS_FLG_NODUMP;
85 else
86 HFSPLUS_I(inode).userflags &= ~HFSPLUS_FLG_NODUMP;
87
88 inode->i_ctime = CURRENT_TIME_SEC;
89 mark_inode_dirty(inode);
90setflags_out:
91 mnt_drop_write(filp->f_path.mnt);
92 unlock_kernel();
93 return err;
94 } 68 }
69
70 /* don't silently ignore unsupported ext2 flags */
71 if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
72 err = -EOPNOTSUPP;
73 goto out_unlock_inode;
74 }
75
76 if (flags & FS_IMMUTABLE_FL)
77 inode->i_flags |= S_IMMUTABLE;
78 else
79 inode->i_flags &= ~S_IMMUTABLE;
80
81 if (flags & FS_APPEND_FL)
82 inode->i_flags |= S_APPEND;
83 else
84 inode->i_flags &= ~S_APPEND;
85
86 if (flags & FS_NODUMP_FL)
87 hip->userflags |= HFSPLUS_FLG_NODUMP;
88 else
89 hip->userflags &= ~HFSPLUS_FLG_NODUMP;
90
91 inode->i_ctime = CURRENT_TIME_SEC;
92 mark_inode_dirty(inode);
93
94out_unlock_inode:
 	 	 95 	mutex_unlock(&inode->i_mutex);
96out_drop_write:
97 mnt_drop_write(file->f_path.mnt);
98out:
99 return err;
100}
101
102long hfsplus_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
103{
104 void __user *argp = (void __user *)arg;
105
106 switch (cmd) {
107 case HFSPLUS_IOC_EXT2_GETFLAGS:
108 return hfsplus_ioctl_getflags(file, argp);
109 case HFSPLUS_IOC_EXT2_SETFLAGS:
110 return hfsplus_ioctl_setflags(file, argp);
95 default: 111 default:
96 unlock_kernel();
97 return -ENOTTY; 112 return -ENOTTY;
98 } 113 }
99} 114}
@@ -110,7 +125,7 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
110 if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode)) 125 if (!S_ISREG(inode->i_mode) || HFSPLUS_IS_RSRC(inode))
111 return -EOPNOTSUPP; 126 return -EOPNOTSUPP;
112 127
113 res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd); 128 res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
114 if (res) 129 if (res)
115 return res; 130 return res;
116 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); 131 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
@@ -153,7 +168,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
153 return -EOPNOTSUPP; 168 return -EOPNOTSUPP;
154 169
155 if (size) { 170 if (size) {
156 res = hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd); 171 res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
157 if (res) 172 if (res)
158 return res; 173 return res;
159 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); 174 res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
@@ -177,7 +192,7 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
177 } else 192 } else
178 res = size ? -ERANGE : 4; 193 res = size ? -ERANGE : 4;
179 } else 194 } else
180 res = -ENODATA; 195 res = -EOPNOTSUPP;
181out: 196out:
182 if (size) 197 if (size)
183 hfs_find_exit(&fd); 198 hfs_find_exit(&fd);
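
The rewrite above splits the old monolithic ioctl into hfsplus_ioctl_getflags()/hfsplus_ioctl_setflags() and derives the ext2-style flag word from inode->i_flags plus the BSD user flags, instead of from the removed rootflags field. The mapping itself is a pure function; a standalone sketch of it follows (helper name and constant values are made up here so the fragment compiles on its own):

#define MY_S_IMMUTABLE      0x01u   /* stand-in for the kernel's S_IMMUTABLE */
#define MY_S_APPEND         0x02u   /* stand-in for S_APPEND */
#define MY_FLG_NODUMP       0x40u   /* stand-in for HFSPLUS_FLG_NODUMP */

#define MY_FS_IMMUTABLE_FL  0x00000010u
#define MY_FS_APPEND_FL     0x00000020u
#define MY_FS_NODUMP_FL     0x00000040u

/* same mapping as hfsplus_ioctl_getflags(): VFS inode flags plus on-disk
 * user flags folded into one ext2-style flag word handed back to userspace */
static unsigned int hfsplus_flags_to_ext2(unsigned int i_flags, unsigned char userflags)
{
	unsigned int flags = 0;

	if (i_flags & MY_S_IMMUTABLE)
		flags |= MY_FS_IMMUTABLE_FL;
	if (i_flags & MY_S_APPEND)
		flags |= MY_FS_APPEND_FL;
	if (userflags & MY_FLG_NODUMP)
		flags |= MY_FS_NODUMP_FL;
	return flags;
}
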
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 572628b4b07d..f9ab276a4d8d 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -143,13 +143,13 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
143 kfree(p); 143 kfree(p);
144 break; 144 break;
145 case opt_decompose: 145 case opt_decompose:
146 sbi->flags &= ~HFSPLUS_SB_NODECOMPOSE; 146 clear_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
147 break; 147 break;
148 case opt_nodecompose: 148 case opt_nodecompose:
149 sbi->flags |= HFSPLUS_SB_NODECOMPOSE; 149 set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags);
150 break; 150 break;
151 case opt_force: 151 case opt_force:
152 sbi->flags |= HFSPLUS_SB_FORCE; 152 set_bit(HFSPLUS_SB_FORCE, &sbi->flags);
153 break; 153 break;
154 default: 154 default:
155 return 0; 155 return 0;
@@ -171,7 +171,7 @@ done:
171 171
172int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt) 172int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
173{ 173{
174 struct hfsplus_sb_info *sbi = &HFSPLUS_SB(mnt->mnt_sb); 174 struct hfsplus_sb_info *sbi = HFSPLUS_SB(mnt->mnt_sb);
175 175
176 if (sbi->creator != HFSPLUS_DEF_CR_TYPE) 176 if (sbi->creator != HFSPLUS_DEF_CR_TYPE)
177 seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator); 177 seq_printf(seq, ",creator=%.4s", (char *)&sbi->creator);
@@ -184,7 +184,7 @@ int hfsplus_show_options(struct seq_file *seq, struct vfsmount *mnt)
184 seq_printf(seq, ",session=%u", sbi->session); 184 seq_printf(seq, ",session=%u", sbi->session);
185 if (sbi->nls) 185 if (sbi->nls)
186 seq_printf(seq, ",nls=%s", sbi->nls->charset); 186 seq_printf(seq, ",nls=%s", sbi->nls->charset);
187 if (sbi->flags & HFSPLUS_SB_NODECOMPOSE) 187 if (test_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags))
188 seq_printf(seq, ",nodecompose"); 188 seq_printf(seq, ",nodecompose");
189 return 0; 189 return 0;
190} 190}
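
These option-parsing changes only work because the superblock flags in hfsplus_fs.h were redefined from mask values (0x0001, 0x0002, ...) to bit numbers (0, 1, 2, ...): set_bit()/clear_bit()/test_bit() take a bit index into an unsigned long and do the shifting themselves, atomically. A simplified, non-atomic sketch of those helpers, just to show why the constants had to change form (the real versions are atomic and provided by the architecture's bitops):

/* bit numbers, as in the reworked header -- not masks */
#define SKETCH_SB_WRITEBACKUP  0
#define SKETCH_SB_NODECOMPOSE  1
#define SKETCH_SB_FORCE        2

static void sketch_set_bit(int nr, unsigned long *word)   { *word |= 1UL << nr; }
static void sketch_clear_bit(int nr, unsigned long *word) { *word &= ~(1UL << nr); }
static int  sketch_test_bit(int nr, const unsigned long *word) { return (*word >> nr) & 1; }

/* usage mirrors the hunk above: set_bit(HFSPLUS_SB_NODECOMPOSE, &sbi->flags), etc. */
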
diff --git a/fs/hfsplus/part_tbl.c b/fs/hfsplus/part_tbl.c
index 1528a6fd0299..208b16c645cc 100644
--- a/fs/hfsplus/part_tbl.c
+++ b/fs/hfsplus/part_tbl.c
@@ -74,6 +74,7 @@ struct old_pmap {
74int hfs_part_find(struct super_block *sb, 74int hfs_part_find(struct super_block *sb,
75 sector_t *part_start, sector_t *part_size) 75 sector_t *part_start, sector_t *part_size)
76{ 76{
77 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
77 struct buffer_head *bh; 78 struct buffer_head *bh;
78 __be16 *data; 79 __be16 *data;
79 int i, size, res; 80 int i, size, res;
@@ -95,7 +96,7 @@ int hfs_part_find(struct super_block *sb,
95 for (i = 0; i < size; p++, i++) { 96 for (i = 0; i < size; p++, i++) {
96 if (p->pdStart && p->pdSize && 97 if (p->pdStart && p->pdSize &&
97 p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ && 98 p->pdFSID == cpu_to_be32(0x54465331)/*"TFS1"*/ &&
98 (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) { 99 (sbi->part < 0 || sbi->part == i)) {
99 *part_start += be32_to_cpu(p->pdStart); 100 *part_start += be32_to_cpu(p->pdStart);
100 *part_size = be32_to_cpu(p->pdSize); 101 *part_size = be32_to_cpu(p->pdSize);
101 res = 0; 102 res = 0;
@@ -111,7 +112,7 @@ int hfs_part_find(struct super_block *sb,
111 size = be32_to_cpu(pm->pmMapBlkCnt); 112 size = be32_to_cpu(pm->pmMapBlkCnt);
112 for (i = 0; i < size;) { 113 for (i = 0; i < size;) {
113 if (!memcmp(pm->pmPartType,"Apple_HFS", 9) && 114 if (!memcmp(pm->pmPartType,"Apple_HFS", 9) &&
114 (HFSPLUS_SB(sb).part < 0 || HFSPLUS_SB(sb).part == i)) { 115 (sbi->part < 0 || sbi->part == i)) {
115 *part_start += be32_to_cpu(pm->pmPyPartStart); 116 *part_start += be32_to_cpu(pm->pmPyPartStart);
116 *part_size = be32_to_cpu(pm->pmPartBlkCnt); 117 *part_size = be32_to_cpu(pm->pmPartBlkCnt);
117 res = 0; 118 res = 0;
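
Editor's note: apart from caching HFSPLUS_SB(sb) in a local sbi, the partition scan above is unchanged. The 0x54465331 constant it compares against pdFSID is simply the ASCII bytes "TFS1" packed big-endian, as the in-line comment already notes; a quick stand-alone check:

```c
#include <stdio.h>

int main(void)
{
	unsigned int magic = 0x54465331;	/* the pdFSID value the scan looks for */

	printf("%c%c%c%c\n",
	       (magic >> 24) & 0xff, (magic >> 16) & 0xff,
	       (magic >>  8) & 0xff,  magic        & 0xff);	/* prints TFS1 */
	return 0;
}
```
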
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 3b55c050c742..9a88d7536103 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -12,7 +12,6 @@
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/smp_lock.h>
16#include <linux/vfs.h> 15#include <linux/vfs.h>
17#include <linux/nls.h> 16#include <linux/nls.h>
18 17
@@ -21,40 +20,11 @@ static void hfsplus_destroy_inode(struct inode *inode);
21 20
22#include "hfsplus_fs.h" 21#include "hfsplus_fs.h"
23 22
24struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino) 23static int hfsplus_system_read_inode(struct inode *inode)
25{ 24{
26 struct hfs_find_data fd; 25 struct hfsplus_vh *vhdr = HFSPLUS_SB(inode->i_sb)->s_vhdr;
27 struct hfsplus_vh *vhdr;
28 struct inode *inode;
29 long err = -EIO;
30
31 inode = iget_locked(sb, ino);
32 if (!inode)
33 return ERR_PTR(-ENOMEM);
34 if (!(inode->i_state & I_NEW))
35 return inode;
36 26
37 INIT_LIST_HEAD(&HFSPLUS_I(inode).open_dir_list); 27 switch (inode->i_ino) {
38 mutex_init(&HFSPLUS_I(inode).extents_lock);
39 HFSPLUS_I(inode).flags = 0;
40 HFSPLUS_I(inode).rsrc_inode = NULL;
41 atomic_set(&HFSPLUS_I(inode).opencnt, 0);
42
43 if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
44 read_inode:
45 hfs_find_init(HFSPLUS_SB(inode->i_sb).cat_tree, &fd);
46 err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
47 if (!err)
48 err = hfsplus_cat_read_inode(inode, &fd);
49 hfs_find_exit(&fd);
50 if (err)
51 goto bad_inode;
52 goto done;
53 }
54 vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr;
55 switch(inode->i_ino) {
56 case HFSPLUS_ROOT_CNID:
57 goto read_inode;
58 case HFSPLUS_EXT_CNID: 28 case HFSPLUS_EXT_CNID:
59 hfsplus_inode_read_fork(inode, &vhdr->ext_file); 29 hfsplus_inode_read_fork(inode, &vhdr->ext_file);
60 inode->i_mapping->a_ops = &hfsplus_btree_aops; 30 inode->i_mapping->a_ops = &hfsplus_btree_aops;
@@ -75,74 +45,101 @@ struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
75 inode->i_mapping->a_ops = &hfsplus_btree_aops; 45 inode->i_mapping->a_ops = &hfsplus_btree_aops;
76 break; 46 break;
77 default: 47 default:
78 goto bad_inode; 48 return -EIO;
49 }
50
51 return 0;
52}
53
54struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
55{
56 struct hfs_find_data fd;
57 struct inode *inode;
58 int err;
59
60 inode = iget_locked(sb, ino);
61 if (!inode)
62 return ERR_PTR(-ENOMEM);
63 if (!(inode->i_state & I_NEW))
64 return inode;
65
66 INIT_LIST_HEAD(&HFSPLUS_I(inode)->open_dir_list);
67 mutex_init(&HFSPLUS_I(inode)->extents_lock);
68 HFSPLUS_I(inode)->flags = 0;
69 HFSPLUS_I(inode)->rsrc_inode = NULL;
70 atomic_set(&HFSPLUS_I(inode)->opencnt, 0);
71
72 if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
73 inode->i_ino == HFSPLUS_ROOT_CNID) {
74 hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd);
75 err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd);
76 if (!err)
77 err = hfsplus_cat_read_inode(inode, &fd);
78 hfs_find_exit(&fd);
79 } else {
80 err = hfsplus_system_read_inode(inode);
81 }
82
83 if (err) {
84 iget_failed(inode);
85 return ERR_PTR(err);
79 } 86 }
80 87
81done:
82 unlock_new_inode(inode); 88 unlock_new_inode(inode);
83 return inode; 89 return inode;
84
85bad_inode:
86 iget_failed(inode);
87 return ERR_PTR(err);
88} 90}
89 91
90static int hfsplus_write_inode(struct inode *inode, 92static int hfsplus_system_write_inode(struct inode *inode)
91 struct writeback_control *wbc)
92{ 93{
93 struct hfsplus_vh *vhdr; 94 struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb);
94 int ret = 0; 95 struct hfsplus_vh *vhdr = sbi->s_vhdr;
96 struct hfsplus_fork_raw *fork;
97 struct hfs_btree *tree = NULL;
95 98
96 dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
97 hfsplus_ext_write_extent(inode);
98 if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID) {
99 return hfsplus_cat_write_inode(inode);
100 }
101 vhdr = HFSPLUS_SB(inode->i_sb).s_vhdr;
102 switch (inode->i_ino) { 99 switch (inode->i_ino) {
103 case HFSPLUS_ROOT_CNID:
104 ret = hfsplus_cat_write_inode(inode);
105 break;
106 case HFSPLUS_EXT_CNID: 100 case HFSPLUS_EXT_CNID:
107 if (vhdr->ext_file.total_size != cpu_to_be64(inode->i_size)) { 101 fork = &vhdr->ext_file;
108 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; 102 tree = sbi->ext_tree;
109 inode->i_sb->s_dirt = 1;
110 }
111 hfsplus_inode_write_fork(inode, &vhdr->ext_file);
112 hfs_btree_write(HFSPLUS_SB(inode->i_sb).ext_tree);
113 break; 103 break;
114 case HFSPLUS_CAT_CNID: 104 case HFSPLUS_CAT_CNID:
115 if (vhdr->cat_file.total_size != cpu_to_be64(inode->i_size)) { 105 fork = &vhdr->cat_file;
116 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; 106 tree = sbi->cat_tree;
117 inode->i_sb->s_dirt = 1;
118 }
119 hfsplus_inode_write_fork(inode, &vhdr->cat_file);
120 hfs_btree_write(HFSPLUS_SB(inode->i_sb).cat_tree);
121 break; 107 break;
122 case HFSPLUS_ALLOC_CNID: 108 case HFSPLUS_ALLOC_CNID:
123 if (vhdr->alloc_file.total_size != cpu_to_be64(inode->i_size)) { 109 fork = &vhdr->alloc_file;
124 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
125 inode->i_sb->s_dirt = 1;
126 }
127 hfsplus_inode_write_fork(inode, &vhdr->alloc_file);
128 break; 110 break;
129 case HFSPLUS_START_CNID: 111 case HFSPLUS_START_CNID:
130 if (vhdr->start_file.total_size != cpu_to_be64(inode->i_size)) { 112 fork = &vhdr->start_file;
131 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP;
132 inode->i_sb->s_dirt = 1;
133 }
134 hfsplus_inode_write_fork(inode, &vhdr->start_file);
135 break; 113 break;
136 case HFSPLUS_ATTR_CNID: 114 case HFSPLUS_ATTR_CNID:
137 if (vhdr->attr_file.total_size != cpu_to_be64(inode->i_size)) { 115 fork = &vhdr->attr_file;
138 HFSPLUS_SB(inode->i_sb).flags |= HFSPLUS_SB_WRITEBACKUP; 116 tree = sbi->attr_tree;
139 inode->i_sb->s_dirt = 1; 117 default:
140 } 118 return -EIO;
141 hfsplus_inode_write_fork(inode, &vhdr->attr_file); 119 }
142 hfs_btree_write(HFSPLUS_SB(inode->i_sb).attr_tree); 120
143 break; 121 if (fork->total_size != cpu_to_be64(inode->i_size)) {
122 set_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags);
123 inode->i_sb->s_dirt = 1;
144 } 124 }
145 return ret; 125 hfsplus_inode_write_fork(inode, fork);
126 if (tree)
127 hfs_btree_write(tree);
128 return 0;
129}
130
131static int hfsplus_write_inode(struct inode *inode,
132 struct writeback_control *wbc)
133{
134 dprint(DBG_INODE, "hfsplus_write_inode: %lu\n", inode->i_ino);
135
136 hfsplus_ext_write_extent(inode);
137
138 if (inode->i_ino >= HFSPLUS_FIRSTUSER_CNID ||
139 inode->i_ino == HFSPLUS_ROOT_CNID)
140 return hfsplus_cat_write_inode(inode);
141 else
142 return hfsplus_system_write_inode(inode);
146} 143}
147 144
148static void hfsplus_evict_inode(struct inode *inode) 145static void hfsplus_evict_inode(struct inode *inode)
@@ -151,51 +148,53 @@ static void hfsplus_evict_inode(struct inode *inode)
151 truncate_inode_pages(&inode->i_data, 0); 148 truncate_inode_pages(&inode->i_data, 0);
152 end_writeback(inode); 149 end_writeback(inode);
153 if (HFSPLUS_IS_RSRC(inode)) { 150 if (HFSPLUS_IS_RSRC(inode)) {
154 HFSPLUS_I(HFSPLUS_I(inode).rsrc_inode).rsrc_inode = NULL; 151 HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
155 iput(HFSPLUS_I(inode).rsrc_inode); 152 iput(HFSPLUS_I(inode)->rsrc_inode);
156 } 153 }
157} 154}
158 155
159int hfsplus_sync_fs(struct super_block *sb, int wait) 156int hfsplus_sync_fs(struct super_block *sb, int wait)
160{ 157{
161 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; 158 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
159 struct hfsplus_vh *vhdr = sbi->s_vhdr;
162 160
163 dprint(DBG_SUPER, "hfsplus_write_super\n"); 161 dprint(DBG_SUPER, "hfsplus_write_super\n");
164 162
165 lock_super(sb); 163 mutex_lock(&sbi->vh_mutex);
164 mutex_lock(&sbi->alloc_mutex);
166 sb->s_dirt = 0; 165 sb->s_dirt = 0;
167 166
168 vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks); 167 vhdr->free_blocks = cpu_to_be32(sbi->free_blocks);
169 vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc); 168 vhdr->next_cnid = cpu_to_be32(sbi->next_cnid);
170 vhdr->next_cnid = cpu_to_be32(HFSPLUS_SB(sb).next_cnid); 169 vhdr->folder_count = cpu_to_be32(sbi->folder_count);
171 vhdr->folder_count = cpu_to_be32(HFSPLUS_SB(sb).folder_count); 170 vhdr->file_count = cpu_to_be32(sbi->file_count);
172 vhdr->file_count = cpu_to_be32(HFSPLUS_SB(sb).file_count);
173 171
174 mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh); 172 mark_buffer_dirty(sbi->s_vhbh);
175 if (HFSPLUS_SB(sb).flags & HFSPLUS_SB_WRITEBACKUP) { 173 if (test_and_clear_bit(HFSPLUS_SB_WRITEBACKUP, &sbi->flags)) {
176 if (HFSPLUS_SB(sb).sect_count) { 174 if (sbi->sect_count) {
177 struct buffer_head *bh; 175 struct buffer_head *bh;
178 u32 block, offset; 176 u32 block, offset;
179 177
180 block = HFSPLUS_SB(sb).blockoffset; 178 block = sbi->blockoffset;
181 block += (HFSPLUS_SB(sb).sect_count - 2) >> (sb->s_blocksize_bits - 9); 179 block += (sbi->sect_count - 2) >> (sb->s_blocksize_bits - 9);
182 offset = ((HFSPLUS_SB(sb).sect_count - 2) << 9) & (sb->s_blocksize - 1); 180 offset = ((sbi->sect_count - 2) << 9) & (sb->s_blocksize - 1);
183 printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n", HFSPLUS_SB(sb).blockoffset, 181 printk(KERN_DEBUG "hfs: backup: %u,%u,%u,%u\n",
184 HFSPLUS_SB(sb).sect_count, block, offset); 182 sbi->blockoffset, sbi->sect_count,
183 block, offset);
185 bh = sb_bread(sb, block); 184 bh = sb_bread(sb, block);
186 if (bh) { 185 if (bh) {
187 vhdr = (struct hfsplus_vh *)(bh->b_data + offset); 186 vhdr = (struct hfsplus_vh *)(bh->b_data + offset);
188 if (be16_to_cpu(vhdr->signature) == HFSPLUS_VOLHEAD_SIG) { 187 if (be16_to_cpu(vhdr->signature) == HFSPLUS_VOLHEAD_SIG) {
189 memcpy(vhdr, HFSPLUS_SB(sb).s_vhdr, sizeof(*vhdr)); 188 memcpy(vhdr, sbi->s_vhdr, sizeof(*vhdr));
190 mark_buffer_dirty(bh); 189 mark_buffer_dirty(bh);
191 brelse(bh); 190 brelse(bh);
192 } else 191 } else
193 printk(KERN_WARNING "hfs: backup not found!\n"); 192 printk(KERN_WARNING "hfs: backup not found!\n");
194 } 193 }
195 } 194 }
196 HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP;
197 } 195 }
198 unlock_super(sb); 196 mutex_unlock(&sbi->alloc_mutex);
197 mutex_unlock(&sbi->vh_mutex);
199 return 0; 198 return 0;
200} 199}
201 200
@@ -209,48 +208,48 @@ static void hfsplus_write_super(struct super_block *sb)
209 208
210static void hfsplus_put_super(struct super_block *sb) 209static void hfsplus_put_super(struct super_block *sb)
211{ 210{
211 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
212
212 dprint(DBG_SUPER, "hfsplus_put_super\n"); 213 dprint(DBG_SUPER, "hfsplus_put_super\n");
214
213 if (!sb->s_fs_info) 215 if (!sb->s_fs_info)
214 return; 216 return;
215 217
216 lock_kernel();
217
218 if (sb->s_dirt) 218 if (sb->s_dirt)
219 hfsplus_write_super(sb); 219 hfsplus_write_super(sb);
220 if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) { 220 if (!(sb->s_flags & MS_RDONLY) && sbi->s_vhdr) {
221 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; 221 struct hfsplus_vh *vhdr = sbi->s_vhdr;
222 222
223 vhdr->modify_date = hfsp_now2mt(); 223 vhdr->modify_date = hfsp_now2mt();
224 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT); 224 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_UNMNT);
225 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT); 225 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_INCNSTNT);
226 mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh); 226 mark_buffer_dirty(sbi->s_vhbh);
227 sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh); 227 sync_dirty_buffer(sbi->s_vhbh);
228 } 228 }
229 229
230 hfs_btree_close(HFSPLUS_SB(sb).cat_tree); 230 hfs_btree_close(sbi->cat_tree);
231 hfs_btree_close(HFSPLUS_SB(sb).ext_tree); 231 hfs_btree_close(sbi->ext_tree);
232 iput(HFSPLUS_SB(sb).alloc_file); 232 iput(sbi->alloc_file);
233 iput(HFSPLUS_SB(sb).hidden_dir); 233 iput(sbi->hidden_dir);
234 brelse(HFSPLUS_SB(sb).s_vhbh); 234 brelse(sbi->s_vhbh);
235 unload_nls(HFSPLUS_SB(sb).nls); 235 unload_nls(sbi->nls);
236 kfree(sb->s_fs_info); 236 kfree(sb->s_fs_info);
237 sb->s_fs_info = NULL; 237 sb->s_fs_info = NULL;
238
239 unlock_kernel();
240} 238}
241 239
242static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) 240static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
243{ 241{
244 struct super_block *sb = dentry->d_sb; 242 struct super_block *sb = dentry->d_sb;
243 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
245 u64 id = huge_encode_dev(sb->s_bdev->bd_dev); 244 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
246 245
247 buf->f_type = HFSPLUS_SUPER_MAGIC; 246 buf->f_type = HFSPLUS_SUPER_MAGIC;
248 buf->f_bsize = sb->s_blocksize; 247 buf->f_bsize = sb->s_blocksize;
249 buf->f_blocks = HFSPLUS_SB(sb).total_blocks << HFSPLUS_SB(sb).fs_shift; 248 buf->f_blocks = sbi->total_blocks << sbi->fs_shift;
250 buf->f_bfree = HFSPLUS_SB(sb).free_blocks << HFSPLUS_SB(sb).fs_shift; 249 buf->f_bfree = sbi->free_blocks << sbi->fs_shift;
251 buf->f_bavail = buf->f_bfree; 250 buf->f_bavail = buf->f_bfree;
252 buf->f_files = 0xFFFFFFFF; 251 buf->f_files = 0xFFFFFFFF;
253 buf->f_ffree = 0xFFFFFFFF - HFSPLUS_SB(sb).next_cnid; 252 buf->f_ffree = 0xFFFFFFFF - sbi->next_cnid;
254 buf->f_fsid.val[0] = (u32)id; 253 buf->f_fsid.val[0] = (u32)id;
255 buf->f_fsid.val[1] = (u32)(id >> 32); 254 buf->f_fsid.val[1] = (u32)(id >> 32);
256 buf->f_namelen = HFSPLUS_MAX_STRLEN; 255 buf->f_namelen = HFSPLUS_MAX_STRLEN;
@@ -263,11 +262,11 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
263 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) 262 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
264 return 0; 263 return 0;
265 if (!(*flags & MS_RDONLY)) { 264 if (!(*flags & MS_RDONLY)) {
266 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr; 265 struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr;
267 struct hfsplus_sb_info sbi; 266 struct hfsplus_sb_info sbi;
268 267
269 memset(&sbi, 0, sizeof(struct hfsplus_sb_info)); 268 memset(&sbi, 0, sizeof(struct hfsplus_sb_info));
270 sbi.nls = HFSPLUS_SB(sb).nls; 269 sbi.nls = HFSPLUS_SB(sb)->nls;
271 if (!hfsplus_parse_options(data, &sbi)) 270 if (!hfsplus_parse_options(data, &sbi))
272 return -EINVAL; 271 return -EINVAL;
273 272
@@ -276,7 +275,7 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data)
276 "running fsck.hfsplus is recommended. leaving read-only.\n"); 275 "running fsck.hfsplus is recommended. leaving read-only.\n");
277 sb->s_flags |= MS_RDONLY; 276 sb->s_flags |= MS_RDONLY;
278 *flags |= MS_RDONLY; 277 *flags |= MS_RDONLY;
279 } else if (sbi.flags & HFSPLUS_SB_FORCE) { 278 } else if (test_bit(HFSPLUS_SB_FORCE, &sbi.flags)) {
280 /* nothing */ 279 /* nothing */
281 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { 280 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
282 printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n"); 281 printk(KERN_WARNING "hfs: filesystem is marked locked, leaving read-only.\n");
@@ -320,7 +319,8 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
320 return -ENOMEM; 319 return -ENOMEM;
321 320
322 sb->s_fs_info = sbi; 321 sb->s_fs_info = sbi;
323 INIT_HLIST_HEAD(&sbi->rsrc_inodes); 322 mutex_init(&sbi->alloc_mutex);
323 mutex_init(&sbi->vh_mutex);
324 hfsplus_fill_defaults(sbi); 324 hfsplus_fill_defaults(sbi);
325 if (!hfsplus_parse_options(data, sbi)) { 325 if (!hfsplus_parse_options(data, sbi)) {
326 printk(KERN_ERR "hfs: unable to parse mount options\n"); 326 printk(KERN_ERR "hfs: unable to parse mount options\n");
@@ -344,7 +344,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
344 err = -EINVAL; 344 err = -EINVAL;
345 goto cleanup; 345 goto cleanup;
346 } 346 }
347 vhdr = HFSPLUS_SB(sb).s_vhdr; 347 vhdr = sbi->s_vhdr;
348 348
349 /* Copy parts of the volume header into the superblock */ 349 /* Copy parts of the volume header into the superblock */
350 sb->s_magic = HFSPLUS_VOLHEAD_SIG; 350 sb->s_magic = HFSPLUS_VOLHEAD_SIG;
@@ -353,18 +353,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
353 printk(KERN_ERR "hfs: wrong filesystem version\n"); 353 printk(KERN_ERR "hfs: wrong filesystem version\n");
354 goto cleanup; 354 goto cleanup;
355 } 355 }
356 HFSPLUS_SB(sb).total_blocks = be32_to_cpu(vhdr->total_blocks); 356 sbi->total_blocks = be32_to_cpu(vhdr->total_blocks);
357 HFSPLUS_SB(sb).free_blocks = be32_to_cpu(vhdr->free_blocks); 357 sbi->free_blocks = be32_to_cpu(vhdr->free_blocks);
358 HFSPLUS_SB(sb).next_alloc = be32_to_cpu(vhdr->next_alloc); 358 sbi->next_cnid = be32_to_cpu(vhdr->next_cnid);
359 HFSPLUS_SB(sb).next_cnid = be32_to_cpu(vhdr->next_cnid); 359 sbi->file_count = be32_to_cpu(vhdr->file_count);
360 HFSPLUS_SB(sb).file_count = be32_to_cpu(vhdr->file_count); 360 sbi->folder_count = be32_to_cpu(vhdr->folder_count);
361 HFSPLUS_SB(sb).folder_count = be32_to_cpu(vhdr->folder_count); 361 sbi->data_clump_blocks =
362 HFSPLUS_SB(sb).data_clump_blocks = be32_to_cpu(vhdr->data_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift; 362 be32_to_cpu(vhdr->data_clump_sz) >> sbi->alloc_blksz_shift;
363 if (!HFSPLUS_SB(sb).data_clump_blocks) 363 if (!sbi->data_clump_blocks)
364 HFSPLUS_SB(sb).data_clump_blocks = 1; 364 sbi->data_clump_blocks = 1;
365 HFSPLUS_SB(sb).rsrc_clump_blocks = be32_to_cpu(vhdr->rsrc_clump_sz) >> HFSPLUS_SB(sb).alloc_blksz_shift; 365 sbi->rsrc_clump_blocks =
366 if (!HFSPLUS_SB(sb).rsrc_clump_blocks) 366 be32_to_cpu(vhdr->rsrc_clump_sz) >> sbi->alloc_blksz_shift;
367 HFSPLUS_SB(sb).rsrc_clump_blocks = 1; 367 if (!sbi->rsrc_clump_blocks)
368 sbi->rsrc_clump_blocks = 1;
368 369
369 /* Set up operations so we can load metadata */ 370 /* Set up operations so we can load metadata */
370 sb->s_op = &hfsplus_sops; 371 sb->s_op = &hfsplus_sops;
@@ -374,7 +375,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
374 printk(KERN_WARNING "hfs: Filesystem was not cleanly unmounted, " 375 printk(KERN_WARNING "hfs: Filesystem was not cleanly unmounted, "
375 "running fsck.hfsplus is recommended. mounting read-only.\n"); 376 "running fsck.hfsplus is recommended. mounting read-only.\n");
376 sb->s_flags |= MS_RDONLY; 377 sb->s_flags |= MS_RDONLY;
377 } else if (sbi->flags & HFSPLUS_SB_FORCE) { 378 } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) {
378 /* nothing */ 379 /* nothing */
379 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { 380 } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) {
380 printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n"); 381 printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n");
@@ -384,16 +385,15 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
384 "use the force option at your own risk, mounting read-only.\n"); 385 "use the force option at your own risk, mounting read-only.\n");
385 sb->s_flags |= MS_RDONLY; 386 sb->s_flags |= MS_RDONLY;
386 } 387 }
387 sbi->flags &= ~HFSPLUS_SB_FORCE;
388 388
389 /* Load metadata objects (B*Trees) */ 389 /* Load metadata objects (B*Trees) */
390 HFSPLUS_SB(sb).ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID); 390 sbi->ext_tree = hfs_btree_open(sb, HFSPLUS_EXT_CNID);
391 if (!HFSPLUS_SB(sb).ext_tree) { 391 if (!sbi->ext_tree) {
392 printk(KERN_ERR "hfs: failed to load extents file\n"); 392 printk(KERN_ERR "hfs: failed to load extents file\n");
393 goto cleanup; 393 goto cleanup;
394 } 394 }
395 HFSPLUS_SB(sb).cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID); 395 sbi->cat_tree = hfs_btree_open(sb, HFSPLUS_CAT_CNID);
396 if (!HFSPLUS_SB(sb).cat_tree) { 396 if (!sbi->cat_tree) {
397 printk(KERN_ERR "hfs: failed to load catalog file\n"); 397 printk(KERN_ERR "hfs: failed to load catalog file\n");
398 goto cleanup; 398 goto cleanup;
399 } 399 }
@@ -404,7 +404,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
404 err = PTR_ERR(inode); 404 err = PTR_ERR(inode);
405 goto cleanup; 405 goto cleanup;
406 } 406 }
407 HFSPLUS_SB(sb).alloc_file = inode; 407 sbi->alloc_file = inode;
408 408
409 /* Load the root directory */ 409 /* Load the root directory */
410 root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID); 410 root = hfsplus_iget(sb, HFSPLUS_ROOT_CNID);
@@ -423,7 +423,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
423 423
424 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1; 424 str.len = sizeof(HFSP_HIDDENDIR_NAME) - 1;
425 str.name = HFSP_HIDDENDIR_NAME; 425 str.name = HFSP_HIDDENDIR_NAME;
426 hfs_find_init(HFSPLUS_SB(sb).cat_tree, &fd); 426 hfs_find_init(sbi->cat_tree, &fd);
427 hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str); 427 hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str);
428 if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { 428 if (!hfs_brec_read(&fd, &entry, sizeof(entry))) {
429 hfs_find_exit(&fd); 429 hfs_find_exit(&fd);
@@ -434,7 +434,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
434 err = PTR_ERR(inode); 434 err = PTR_ERR(inode);
435 goto cleanup; 435 goto cleanup;
436 } 436 }
437 HFSPLUS_SB(sb).hidden_dir = inode; 437 sbi->hidden_dir = inode;
438 } else 438 } else
439 hfs_find_exit(&fd); 439 hfs_find_exit(&fd);
440 440
@@ -449,15 +449,19 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
449 be32_add_cpu(&vhdr->write_count, 1); 449 be32_add_cpu(&vhdr->write_count, 1);
450 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); 450 vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT);
451 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); 451 vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT);
452 mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh); 452 mark_buffer_dirty(sbi->s_vhbh);
453 sync_dirty_buffer(HFSPLUS_SB(sb).s_vhbh); 453 sync_dirty_buffer(sbi->s_vhbh);
454 454
455 if (!HFSPLUS_SB(sb).hidden_dir) { 455 if (!sbi->hidden_dir) {
456 printk(KERN_DEBUG "hfs: create hidden dir...\n"); 456 printk(KERN_DEBUG "hfs: create hidden dir...\n");
457 HFSPLUS_SB(sb).hidden_dir = hfsplus_new_inode(sb, S_IFDIR); 457
458 hfsplus_create_cat(HFSPLUS_SB(sb).hidden_dir->i_ino, sb->s_root->d_inode, 458 mutex_lock(&sbi->vh_mutex);
459 &str, HFSPLUS_SB(sb).hidden_dir); 459 sbi->hidden_dir = hfsplus_new_inode(sb, S_IFDIR);
460 mark_inode_dirty(HFSPLUS_SB(sb).hidden_dir); 460 hfsplus_create_cat(sbi->hidden_dir->i_ino, sb->s_root->d_inode,
461 &str, sbi->hidden_dir);
462 mutex_unlock(&sbi->vh_mutex);
463
464 mark_inode_dirty(sbi->hidden_dir);
461 } 465 }
462out: 466out:
463 unload_nls(sbi->nls); 467 unload_nls(sbi->nls);
@@ -486,7 +490,7 @@ static struct inode *hfsplus_alloc_inode(struct super_block *sb)
486 490
487static void hfsplus_destroy_inode(struct inode *inode) 491static void hfsplus_destroy_inode(struct inode *inode)
488{ 492{
489 kmem_cache_free(hfsplus_inode_cachep, &HFSPLUS_I(inode)); 493 kmem_cache_free(hfsplus_inode_cachep, HFSPLUS_I(inode));
490} 494}
491 495
492#define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info) 496#define HFSPLUS_INODE_SIZE sizeof(struct hfsplus_inode_info)
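
Editor's note: the super.c rework above splits the old monolithic hfsplus_iget()/hfsplus_write_inode() into a common path for catalog-backed inodes (user files plus the root directory) and hfsplus_system_read_inode()/hfsplus_system_write_inode() helpers for the fixed metadata files described directly in the volume header; it also drops lock_super()/lock_kernel() in favour of the new per-superblock vh_mutex and alloc_mutex. The dispatch reduces to one CNID test. A stand-alone sketch follows; the constants use the conventional HFS+ values (2 for the root folder, 16 for the first user CNID), which is an assumption of the example rather than something shown in these hunks.

```c
#include <stdio.h>
#include <stdbool.h>

#define HFSPLUS_ROOT_CNID	2	/* assumed: root folder CNID */
#define HFSPLUS_FIRSTUSER_CNID	16	/* assumed: first user-visible CNID */

/* true -> read/write via the catalog tree, false -> volume-header fork */
static bool hfsplus_is_catalog_inode(unsigned long ino)
{
	return ino >= HFSPLUS_FIRSTUSER_CNID || ino == HFSPLUS_ROOT_CNID;
}

int main(void)
{
	unsigned long samples[] = { 2, 3, 4, 16, 1234 };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("CNID %lu: %s\n", samples[i],
		       hfsplus_is_catalog_inode(samples[i]) ?
		       "catalog" : "system file");
	return 0;
}
```
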
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index 628ccf6fa402..b66d67de882c 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -121,7 +121,7 @@ static u16 *hfsplus_compose_lookup(u16 *p, u16 cc)
121int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, char *astr, int *len_p) 121int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, char *astr, int *len_p)
122{ 122{
123 const hfsplus_unichr *ip; 123 const hfsplus_unichr *ip;
124 struct nls_table *nls = HFSPLUS_SB(sb).nls; 124 struct nls_table *nls = HFSPLUS_SB(sb)->nls;
125 u8 *op; 125 u8 *op;
126 u16 cc, c0, c1; 126 u16 cc, c0, c1;
127 u16 *ce1, *ce2; 127 u16 *ce1, *ce2;
@@ -132,7 +132,7 @@ int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr, c
132 ustrlen = be16_to_cpu(ustr->length); 132 ustrlen = be16_to_cpu(ustr->length);
133 len = *len_p; 133 len = *len_p;
134 ce1 = NULL; 134 ce1 = NULL;
135 compose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); 135 compose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
136 136
137 while (ustrlen > 0) { 137 while (ustrlen > 0) {
138 c0 = be16_to_cpu(*ip++); 138 c0 = be16_to_cpu(*ip++);
@@ -246,7 +246,7 @@ out:
246static inline int asc2unichar(struct super_block *sb, const char *astr, int len, 246static inline int asc2unichar(struct super_block *sb, const char *astr, int len,
247 wchar_t *uc) 247 wchar_t *uc)
248{ 248{
249 int size = HFSPLUS_SB(sb).nls->char2uni(astr, len, uc); 249 int size = HFSPLUS_SB(sb)->nls->char2uni(astr, len, uc);
250 if (size <= 0) { 250 if (size <= 0) {
251 *uc = '?'; 251 *uc = '?';
252 size = 1; 252 size = 1;
@@ -293,7 +293,7 @@ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
293 u16 *dstr, outlen = 0; 293 u16 *dstr, outlen = 0;
294 wchar_t c; 294 wchar_t c;
295 295
296 decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); 296 decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
297 while (outlen < HFSPLUS_MAX_STRLEN && len > 0) { 297 while (outlen < HFSPLUS_MAX_STRLEN && len > 0) {
298 size = asc2unichar(sb, astr, len, &c); 298 size = asc2unichar(sb, astr, len, &c);
299 299
@@ -330,8 +330,8 @@ int hfsplus_hash_dentry(struct dentry *dentry, struct qstr *str)
330 wchar_t c; 330 wchar_t c;
331 u16 c2; 331 u16 c2;
332 332
333 casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD); 333 casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
334 decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); 334 decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
335 hash = init_name_hash(); 335 hash = init_name_hash();
336 astr = str->name; 336 astr = str->name;
337 len = str->len; 337 len = str->len;
@@ -373,8 +373,8 @@ int hfsplus_compare_dentry(struct dentry *dentry, struct qstr *s1, struct qstr *
373 u16 c1, c2; 373 u16 c1, c2;
374 wchar_t c; 374 wchar_t c;
375 375
376 casefold = (HFSPLUS_SB(sb).flags & HFSPLUS_SB_CASEFOLD); 376 casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
377 decompose = !(HFSPLUS_SB(sb).flags & HFSPLUS_SB_NODECOMPOSE); 377 decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
378 astr1 = s1->name; 378 astr1 = s1->name;
379 len1 = s1->len; 379 len1 = s1->len;
380 astr2 = s2->name; 380 astr2 = s2->name;
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index bed78ac8f6d1..8972c20b3216 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -65,8 +65,8 @@ static int hfsplus_get_last_session(struct super_block *sb,
65 *start = 0; 65 *start = 0;
66 *size = sb->s_bdev->bd_inode->i_size >> 9; 66 *size = sb->s_bdev->bd_inode->i_size >> 9;
67 67
68 if (HFSPLUS_SB(sb).session >= 0) { 68 if (HFSPLUS_SB(sb)->session >= 0) {
69 te.cdte_track = HFSPLUS_SB(sb).session; 69 te.cdte_track = HFSPLUS_SB(sb)->session;
70 te.cdte_format = CDROM_LBA; 70 te.cdte_format = CDROM_LBA;
71 res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te); 71 res = ioctl_by_bdev(sb->s_bdev, CDROMREADTOCENTRY, (unsigned long)&te);
72 if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) { 72 if (!res && (te.cdte_ctrl & CDROM_DATA_TRACK) == 4) {
@@ -87,6 +87,7 @@ static int hfsplus_get_last_session(struct super_block *sb,
87/* Takes in super block, returns true if good data read */ 87/* Takes in super block, returns true if good data read */
88int hfsplus_read_wrapper(struct super_block *sb) 88int hfsplus_read_wrapper(struct super_block *sb)
89{ 89{
90 struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
90 struct buffer_head *bh; 91 struct buffer_head *bh;
91 struct hfsplus_vh *vhdr; 92 struct hfsplus_vh *vhdr;
92 struct hfsplus_wd wd; 93 struct hfsplus_wd wd;
@@ -122,7 +123,7 @@ int hfsplus_read_wrapper(struct super_block *sb)
122 if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIG)) 123 if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
123 break; 124 break;
124 if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) { 125 if (vhdr->signature == cpu_to_be16(HFSPLUS_VOLHEAD_SIGX)) {
125 HFSPLUS_SB(sb).flags |= HFSPLUS_SB_HFSX; 126 set_bit(HFSPLUS_SB_HFSX, &sbi->flags);
126 break; 127 break;
127 } 128 }
128 brelse(bh); 129 brelse(bh);
@@ -143,11 +144,11 @@ int hfsplus_read_wrapper(struct super_block *sb)
143 if (blocksize < HFSPLUS_SECTOR_SIZE || 144 if (blocksize < HFSPLUS_SECTOR_SIZE ||
144 ((blocksize - 1) & blocksize)) 145 ((blocksize - 1) & blocksize))
145 return -EINVAL; 146 return -EINVAL;
146 HFSPLUS_SB(sb).alloc_blksz = blocksize; 147 sbi->alloc_blksz = blocksize;
147 HFSPLUS_SB(sb).alloc_blksz_shift = 0; 148 sbi->alloc_blksz_shift = 0;
148 while ((blocksize >>= 1) != 0) 149 while ((blocksize >>= 1) != 0)
149 HFSPLUS_SB(sb).alloc_blksz_shift++; 150 sbi->alloc_blksz_shift++;
150 blocksize = min(HFSPLUS_SB(sb).alloc_blksz, (u32)PAGE_SIZE); 151 blocksize = min(sbi->alloc_blksz, (u32)PAGE_SIZE);
151 152
152 /* align block size to block offset */ 153 /* align block size to block offset */
153 while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1)) 154 while (part_start & ((blocksize >> HFSPLUS_SECTOR_SHIFT) - 1))
@@ -158,23 +159,26 @@ int hfsplus_read_wrapper(struct super_block *sb)
158 return -EINVAL; 159 return -EINVAL;
159 } 160 }
160 161
161 HFSPLUS_SB(sb).blockoffset = part_start >> 162 sbi->blockoffset =
162 (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT); 163 part_start >> (sb->s_blocksize_bits - HFSPLUS_SECTOR_SHIFT);
163 HFSPLUS_SB(sb).sect_count = part_size; 164 sbi->sect_count = part_size;
164 HFSPLUS_SB(sb).fs_shift = HFSPLUS_SB(sb).alloc_blksz_shift - 165 sbi->fs_shift = sbi->alloc_blksz_shift - sb->s_blocksize_bits;
165 sb->s_blocksize_bits;
166 166
167 bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr); 167 bh = sb_bread512(sb, part_start + HFSPLUS_VOLHEAD_SECTOR, vhdr);
168 if (!bh) 168 if (!bh)
169 return -EIO; 169 return -EIO;
170 170
171 /* should still be the same... */ 171 /* should still be the same... */
172 if (vhdr->signature != (HFSPLUS_SB(sb).flags & HFSPLUS_SB_HFSX ? 172 if (test_bit(HFSPLUS_SB_HFSX, &sbi->flags)) {
173 cpu_to_be16(HFSPLUS_VOLHEAD_SIGX) : 173 if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIGX))
174 cpu_to_be16(HFSPLUS_VOLHEAD_SIG))) 174 goto error;
175 goto error; 175 } else {
176 HFSPLUS_SB(sb).s_vhbh = bh; 176 if (vhdr->signature != cpu_to_be16(HFSPLUS_VOLHEAD_SIG))
177 HFSPLUS_SB(sb).s_vhdr = vhdr; 177 goto error;
178 }
179
180 sbi->s_vhbh = bh;
181 sbi->s_vhdr = vhdr;
178 182
179 return 0; 183 return 0;
180 error: 184 error:
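
Editor's note: in hfsplus_read_wrapper() the allocation block size taken from the volume header must be a power of two; (blocksize - 1) & blocksize is the usual power-of-two test, and the shift loop derives log2 of the value for the cached alloc_blksz_shift. A stand-alone version of that arithmetic, with an example value in place of the on-disk field:

```c
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t blocksize = 4096;	/* e.g. read from the volume header */
	uint32_t shift = 0, tmp;

	if (blocksize == 0 || ((blocksize - 1) & blocksize)) {
		fprintf(stderr, "not a power of two\n");
		return 1;
	}
	for (tmp = blocksize; (tmp >>= 1) != 0; )
		shift++;

	printf("alloc_blksz=%u alloc_blksz_shift=%u\n", blocksize, shift);
	return 0;
}
```
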
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index e2bd73e8f9c0..f4d4120e5128 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -129,6 +129,10 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
129#define move_pte(pte, prot, old_addr, new_addr) (pte) 129#define move_pte(pte, prot, old_addr, new_addr) (pte)
130#endif 130#endif
131 131
132#ifndef flush_tlb_fix_spurious_fault
133#define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
134#endif
135
132#ifndef pgprot_noncached 136#ifndef pgprot_noncached
133#define pgprot_noncached(prot) (prot) 137#define pgprot_noncached(prot) (prot)
134#endif 138#endif
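
Editor's note: the new flush_tlb_fix_spurious_fault() hook follows the usual asm-generic pattern: an architecture that can tolerate a spurious write-protection fault without flushing defines the macro in its own headers, and everyone else silently inherits the flush_tlb_page() fallback. The idiom itself, stripped of kernel context (the function below is a stand-in, not the real flush):

```c
#include <stdio.h>

/* stand-in for the generic per-page flush */
static void flush_tlb_page(void *vma, unsigned long address)
{
	printf("flushing TLB entry for %#lx\n", address);
}

/*
 * An architecture header included earlier could have defined this to a
 * no-op; if it did not, fall back to the full per-page flush.
 */
#ifndef flush_tlb_fix_spurious_fault
#define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
#endif

int main(void)
{
	flush_tlb_fix_spurious_fault(NULL, 0x1000UL);
	return 0;
}
```
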
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index ef2af9948eac..f4229fb315e1 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -687,7 +687,9 @@
687 - LOAD_OFFSET) { \ 687 - LOAD_OFFSET) { \
688 VMLINUX_SYMBOL(__per_cpu_start) = .; \ 688 VMLINUX_SYMBOL(__per_cpu_start) = .; \
689 *(.data..percpu..first) \ 689 *(.data..percpu..first) \
690 . = ALIGN(PAGE_SIZE); \
690 *(.data..percpu..page_aligned) \ 691 *(.data..percpu..page_aligned) \
692 *(.data..percpu..readmostly) \
691 *(.data..percpu) \ 693 *(.data..percpu) \
692 *(.data..percpu..shared_aligned) \ 694 *(.data..percpu..shared_aligned) \
693 VMLINUX_SYMBOL(__per_cpu_end) = .; \ 695 VMLINUX_SYMBOL(__per_cpu_end) = .; \
@@ -713,7 +715,9 @@
713 VMLINUX_SYMBOL(__per_cpu_load) = .; \ 715 VMLINUX_SYMBOL(__per_cpu_load) = .; \
714 VMLINUX_SYMBOL(__per_cpu_start) = .; \ 716 VMLINUX_SYMBOL(__per_cpu_start) = .; \
715 *(.data..percpu..first) \ 717 *(.data..percpu..first) \
718 . = ALIGN(PAGE_SIZE); \
716 *(.data..percpu..page_aligned) \ 719 *(.data..percpu..page_aligned) \
720 *(.data..percpu..readmostly) \
717 *(.data..percpu) \ 721 *(.data..percpu) \
718 *(.data..percpu..shared_aligned) \ 722 *(.data..percpu..shared_aligned) \
719 VMLINUX_SYMBOL(__per_cpu_end) = .; \ 723 VMLINUX_SYMBOL(__per_cpu_end) = .; \
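
Editor's note: two things change in the per-cpu output sections: an explicit ALIGN(PAGE_SIZE) so the page-aligned input section really starts on a page boundary regardless of what ..first emitted, and a new .data..percpu..readmostly input section that groups rarely written per-cpu variables away from frequently written ones to limit false sharing. The underlying mechanism is just a named ELF section; a user-space stand-in, where the section name is the only thing borrowed from the kernel:

```c
#include <stdio.h>

/* place a variable into a custom-named section, the same mechanism the
 * kernel's read-mostly per-cpu annotation relies on */
static int mostly_read __attribute__((section(".data..percpu..readmostly"))) = 42;

int main(void)
{
	printf("%d\n", mostly_read);	/* `objdump -h` on the binary shows the section */
	return 0;
}
```
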
diff --git a/include/linux/acpi_pmtmr.h b/include/linux/acpi_pmtmr.h
index 7e3d2859be50..1d0ef1ae8036 100644
--- a/include/linux/acpi_pmtmr.h
+++ b/include/linux/acpi_pmtmr.h
@@ -25,8 +25,6 @@ static inline u32 acpi_pm_read_early(void)
25 return acpi_pm_read_verified() & ACPI_PM_MASK; 25 return acpi_pm_read_verified() & ACPI_PM_MASK;
26} 26}
27 27
28extern void pmtimer_wait(unsigned);
29
30#else 28#else
31 29
32static inline u32 acpi_pm_read_early(void) 30static inline u32 acpi_pm_read_early(void)
diff --git a/fs/ceph/auth.h b/include/linux/ceph/auth.h
index d38a2fb4a137..7fff521d7eb5 100644
--- a/fs/ceph/auth.h
+++ b/include/linux/ceph/auth.h
@@ -1,8 +1,8 @@
1#ifndef _FS_CEPH_AUTH_H 1#ifndef _FS_CEPH_AUTH_H
2#define _FS_CEPH_AUTH_H 2#define _FS_CEPH_AUTH_H
3 3
4#include "types.h" 4#include <linux/ceph/types.h>
5#include "buffer.h" 5#include <linux/ceph/buffer.h>
6 6
7/* 7/*
8 * Abstract interface for communicating with the authenticate module. 8 * Abstract interface for communicating with the authenticate module.
diff --git a/fs/ceph/buffer.h b/include/linux/ceph/buffer.h
index 58d19014068f..58d19014068f 100644
--- a/fs/ceph/buffer.h
+++ b/include/linux/ceph/buffer.h
diff --git a/fs/ceph/ceph_debug.h b/include/linux/ceph/ceph_debug.h
index 1818c2305610..aa2e19182d99 100644
--- a/fs/ceph/ceph_debug.h
+++ b/include/linux/ceph/ceph_debug.h
@@ -3,7 +3,7 @@
3 3
4#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 4#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
5 5
6#ifdef CONFIG_CEPH_FS_PRETTYDEBUG 6#ifdef CONFIG_CEPH_LIB_PRETTYDEBUG
7 7
8/* 8/*
9 * wrap pr_debug to include a filename:lineno prefix on each line. 9 * wrap pr_debug to include a filename:lineno prefix on each line.
@@ -14,7 +14,8 @@
14# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG) 14# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
15extern const char *ceph_file_part(const char *s, int len); 15extern const char *ceph_file_part(const char *s, int len);
16# define dout(fmt, ...) \ 16# define dout(fmt, ...) \
17 pr_debug(" %12.12s:%-4d : " fmt, \ 17 pr_debug("%.*s %12.12s:%-4d : " fmt, \
18 8 - (int)sizeof(KBUILD_MODNAME), " ", \
18 ceph_file_part(__FILE__, sizeof(__FILE__)), \ 19 ceph_file_part(__FILE__, sizeof(__FILE__)), \
19 __LINE__, ##__VA_ARGS__) 20 __LINE__, ##__VA_ARGS__)
20# else 21# else
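
Editor's note: the tweak to dout() pads the prefix so debug output stays column-aligned no matter how long the module name is: %.*s prints at most 8 - sizeof(KBUILD_MODNAME) characters of a run of spaces, i.e. no padding for an 8-byte name like "libceph" and more for shorter ones. A stand-alone demonstration; KBUILD_MODNAME is hard-coded here, whereas in the kernel the build system defines it.

```c
#include <stdio.h>

#define KBUILD_MODNAME "ceph"	/* sizeof == 5, so three spaces of padding */

int main(void)
{
	printf("%.*s %12.12s:%-4d : %s\n",
	       8 - (int)sizeof(KBUILD_MODNAME), "        ",
	       "inode.c", 123, "example message");
	return 0;
}
```
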
diff --git a/fs/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h
index 5babb8e95352..5babb8e95352 100644
--- a/fs/ceph/ceph_frag.h
+++ b/include/linux/ceph/ceph_frag.h
diff --git a/fs/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index d5619ac86711..c3c74aef289d 100644
--- a/fs/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -299,6 +299,7 @@ enum {
299 CEPH_MDS_OP_SETATTR = 0x01108, 299 CEPH_MDS_OP_SETATTR = 0x01108,
300 CEPH_MDS_OP_SETFILELOCK= 0x01109, 300 CEPH_MDS_OP_SETFILELOCK= 0x01109,
301 CEPH_MDS_OP_GETFILELOCK= 0x00110, 301 CEPH_MDS_OP_GETFILELOCK= 0x00110,
302 CEPH_MDS_OP_SETDIRLAYOUT=0x0110a,
302 303
303 CEPH_MDS_OP_MKNOD = 0x01201, 304 CEPH_MDS_OP_MKNOD = 0x01201,
304 CEPH_MDS_OP_LINK = 0x01202, 305 CEPH_MDS_OP_LINK = 0x01202,
diff --git a/fs/ceph/ceph_hash.h b/include/linux/ceph/ceph_hash.h
index d099c3f90236..d099c3f90236 100644
--- a/fs/ceph/ceph_hash.h
+++ b/include/linux/ceph/ceph_hash.h
diff --git a/include/linux/ceph/debugfs.h b/include/linux/ceph/debugfs.h
new file mode 100644
index 000000000000..2a79702e092b
--- /dev/null
+++ b/include/linux/ceph/debugfs.h
@@ -0,0 +1,33 @@
1#ifndef _FS_CEPH_DEBUGFS_H
2#define _FS_CEPH_DEBUGFS_H
3
4#include "ceph_debug.h"
5#include "types.h"
6
7#define CEPH_DEFINE_SHOW_FUNC(name) \
8static int name##_open(struct inode *inode, struct file *file) \
9{ \
10 struct seq_file *sf; \
11 int ret; \
12 \
13 ret = single_open(file, name, NULL); \
14 sf = file->private_data; \
15 sf->private = inode->i_private; \
16 return ret; \
17} \
18 \
19static const struct file_operations name##_fops = { \
20 .open = name##_open, \
21 .read = seq_read, \
22 .llseek = seq_lseek, \
23 .release = single_release, \
24};
25
26/* debugfs.c */
27extern int ceph_debugfs_init(void);
28extern void ceph_debugfs_cleanup(void);
29extern int ceph_debugfs_client_init(struct ceph_client *client);
30extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
31
32#endif
33
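
Editor's note: CEPH_DEFINE_SHOW_FUNC() generates the single_open() boilerplate for a seq_file-backed debugfs entry, so each statistics file only has to supply its show routine. A hedged, kernel-context sketch of how a caller is meant to use it; this is not standalone code, and the show function below is illustrative rather than one of the real ones.

```c
/* provide a seq_file show routine ... */
static int example_stats_show(struct seq_file *s, void *p)
{
	seq_printf(s, "whatever this file reports\n");
	return 0;
}

/* ... and let the macro emit example_stats_show_open() plus
 * example_stats_show_fops for debugfs_create_file() to hook up */
CEPH_DEFINE_SHOW_FUNC(example_stats_show)
```
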
diff --git a/fs/ceph/decode.h b/include/linux/ceph/decode.h
index 3d25415afe63..c5b6939fb32a 100644
--- a/fs/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -191,6 +191,11 @@ static inline void ceph_encode_string(void **p, void *end,
191 ceph_encode_need(p, end, n, bad); \ 191 ceph_encode_need(p, end, n, bad); \
192 ceph_encode_copy(p, pv, n); \ 192 ceph_encode_copy(p, pv, n); \
193 } while (0) 193 } while (0)
194#define ceph_encode_string_safe(p, end, s, n, bad) \
195 do { \
196 ceph_encode_need(p, end, n, bad); \
197 ceph_encode_string(p, end, s, n); \
198 } while (0)
194 199
195 200
196#endif 201#endif
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
new file mode 100644
index 000000000000..f22b2e941686
--- /dev/null
+++ b/include/linux/ceph/libceph.h
@@ -0,0 +1,249 @@
1#ifndef _FS_CEPH_LIBCEPH_H
2#define _FS_CEPH_LIBCEPH_H
3
4#include "ceph_debug.h"
5
6#include <asm/unaligned.h>
7#include <linux/backing-dev.h>
8#include <linux/completion.h>
9#include <linux/exportfs.h>
10#include <linux/fs.h>
11#include <linux/mempool.h>
12#include <linux/pagemap.h>
13#include <linux/wait.h>
14#include <linux/writeback.h>
15#include <linux/slab.h>
16
17#include "types.h"
18#include "messenger.h"
19#include "msgpool.h"
20#include "mon_client.h"
21#include "osd_client.h"
22#include "ceph_fs.h"
23
24/*
25 * Supported features
26 */
27#define CEPH_FEATURE_SUPPORTED_DEFAULT CEPH_FEATURE_NOSRCADDR
28#define CEPH_FEATURE_REQUIRED_DEFAULT CEPH_FEATURE_NOSRCADDR
29
30/*
31 * mount options
32 */
33#define CEPH_OPT_FSID (1<<0)
34#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
35#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
36#define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes */
37
38#define CEPH_OPT_DEFAULT (0);
39
40#define ceph_set_opt(client, opt) \
41 (client)->options->flags |= CEPH_OPT_##opt;
42#define ceph_test_opt(client, opt) \
43 (!!((client)->options->flags & CEPH_OPT_##opt))
44
45struct ceph_options {
46 int flags;
47 struct ceph_fsid fsid;
48 struct ceph_entity_addr my_addr;
49 int mount_timeout;
50 int osd_idle_ttl;
51 int osd_timeout;
52 int osd_keepalive_timeout;
53
54 /*
55 * any type that can't be simply compared or doesn't need need
56 * to be compared should go beyond this point,
57 * ceph_compare_options() should be updated accordingly
58 */
59
60 struct ceph_entity_addr *mon_addr; /* should be the first
61 pointer type of args */
62 int num_mon;
63 char *name;
64 char *secret;
65};
66
67/*
68 * defaults
69 */
70#define CEPH_MOUNT_TIMEOUT_DEFAULT 60
71#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */
72#define CEPH_OSD_KEEPALIVE_DEFAULT 5
73#define CEPH_OSD_IDLE_TTL_DEFAULT 60
74#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
75
76#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
77#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
78
79#define CEPH_AUTH_NAME_DEFAULT "guest"
80
81/*
82 * Delay telling the MDS we no longer want caps, in case we reopen
83 * the file. Delay a minimum amount of time, even if we send a cap
84 * message for some other reason. Otherwise, take the oppotunity to
85 * update the mds to avoid sending another message later.
86 */
87#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
88#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
89
90#define CEPH_CAP_RELEASE_SAFETY_DEFAULT (CEPH_CAPS_PER_RELEASE * 4)
91
92/* mount state */
93enum {
94 CEPH_MOUNT_MOUNTING,
95 CEPH_MOUNT_MOUNTED,
96 CEPH_MOUNT_UNMOUNTING,
97 CEPH_MOUNT_UNMOUNTED,
98 CEPH_MOUNT_SHUTDOWN,
99};
100
101/*
102 * subtract jiffies
103 */
104static inline unsigned long time_sub(unsigned long a, unsigned long b)
105{
106 BUG_ON(time_after(b, a));
107 return (long)a - (long)b;
108}
109
110struct ceph_mds_client;
111
112/*
113 * per client state
114 *
115 * possibly shared by multiple mount points, if they are
116 * mounting the same ceph filesystem/cluster.
117 */
118struct ceph_client {
119 struct ceph_fsid fsid;
120 bool have_fsid;
121
122 void *private;
123
124 struct ceph_options *options;
125
126 struct mutex mount_mutex; /* serialize mount attempts */
127 wait_queue_head_t auth_wq;
128 int auth_err;
129
130 int (*extra_mon_dispatch)(struct ceph_client *, struct ceph_msg *);
131
132 u32 supported_features;
133 u32 required_features;
134
135 struct ceph_messenger *msgr; /* messenger instance */
136 struct ceph_mon_client monc;
137 struct ceph_osd_client osdc;
138
139#ifdef CONFIG_DEBUG_FS
140 struct dentry *debugfs_dir;
141 struct dentry *debugfs_monmap;
142 struct dentry *debugfs_osdmap;
143#endif
144};
145
146
147
148/*
149 * snapshots
150 */
151
152/*
153 * A "snap context" is the set of existing snapshots when we
154 * write data. It is used by the OSD to guide its COW behavior.
155 *
156 * The ceph_snap_context is refcounted, and attached to each dirty
157 * page, indicating which context the dirty data belonged when it was
158 * dirtied.
159 */
160struct ceph_snap_context {
161 atomic_t nref;
162 u64 seq;
163 int num_snaps;
164 u64 snaps[];
165};
166
167static inline struct ceph_snap_context *
168ceph_get_snap_context(struct ceph_snap_context *sc)
169{
170 /*
171 printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
172 atomic_read(&sc->nref)+1);
173 */
174 if (sc)
175 atomic_inc(&sc->nref);
176 return sc;
177}
178
179static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
180{
181 if (!sc)
182 return;
183 /*
184 printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
185 atomic_read(&sc->nref)-1);
186 */
187 if (atomic_dec_and_test(&sc->nref)) {
188 /*printk(" deleting snap_context %p\n", sc);*/
189 kfree(sc);
190 }
191}
192
193/*
194 * calculate the number of pages a given length and offset map onto,
195 * if we align the data.
196 */
197static inline int calc_pages_for(u64 off, u64 len)
198{
199 return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
200 (off >> PAGE_CACHE_SHIFT);
201}
202
203/* ceph_common.c */
204extern const char *ceph_msg_type_name(int type);
205extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
206extern struct kmem_cache *ceph_inode_cachep;
207extern struct kmem_cache *ceph_cap_cachep;
208extern struct kmem_cache *ceph_dentry_cachep;
209extern struct kmem_cache *ceph_file_cachep;
210
211extern int ceph_parse_options(struct ceph_options **popt, char *options,
212 const char *dev_name, const char *dev_name_end,
213 int (*parse_extra_token)(char *c, void *private),
214 void *private);
215extern void ceph_destroy_options(struct ceph_options *opt);
216extern int ceph_compare_options(struct ceph_options *new_opt,
217 struct ceph_client *client);
218extern struct ceph_client *ceph_create_client(struct ceph_options *opt,
219 void *private);
220extern u64 ceph_client_id(struct ceph_client *client);
221extern void ceph_destroy_client(struct ceph_client *client);
222extern int __ceph_open_session(struct ceph_client *client,
223 unsigned long started);
224extern int ceph_open_session(struct ceph_client *client);
225
226/* pagevec.c */
227extern void ceph_release_page_vector(struct page **pages, int num_pages);
228
229extern struct page **ceph_get_direct_page_vector(const char __user *data,
230 int num_pages,
231 loff_t off, size_t len);
232extern void ceph_put_page_vector(struct page **pages, int num_pages);
233extern void ceph_release_page_vector(struct page **pages, int num_pages);
234extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
235extern int ceph_copy_user_to_page_vector(struct page **pages,
236 const char __user *data,
237 loff_t off, size_t len);
238extern int ceph_copy_to_page_vector(struct page **pages,
239 const char *data,
240 loff_t off, size_t len);
241extern int ceph_copy_from_page_vector(struct page **pages,
242 char *data,
243 loff_t off, size_t len);
244extern int ceph_copy_page_vector_to_user(struct page **pages, char __user *data,
245 loff_t off, size_t len);
246extern void ceph_zero_page_vector_range(int off, int len, struct page **pages);
247
248
249#endif /* _FS_CEPH_SUPER_H */
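
Editor's note: most of libceph.h is collected from the old fs/ceph/super.h. One small helper worth a worked example is calc_pages_for(), which counts how many page-cache pages an (offset, length) range touches once the data is page-aligned. A stand-alone check with 4 KiB pages; the page size is an assumption of the example, not of the header.

```c
#include <stdio.h>
#include <stdint.h>

#define PAGE_CACHE_SHIFT 12			/* assume 4 KiB pages */
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)

static int calc_pages_for(uint64_t off, uint64_t len)
{
	return ((off + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
	       (off >> PAGE_CACHE_SHIFT);
}

int main(void)
{
	printf("%d\n", calc_pages_for(4090, 10));	/* straddles a boundary: 2 */
	printf("%d\n", calc_pages_for(0, 4096));	/* exactly one page:      1 */
	printf("%d\n", calc_pages_for(4096, 8192));	/* aligned, two pages:    2 */
	return 0;
}
```
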
diff --git a/fs/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h
index 4c5cb0880bba..4c5cb0880bba 100644
--- a/fs/ceph/mdsmap.h
+++ b/include/linux/ceph/mdsmap.h
diff --git a/fs/ceph/messenger.h b/include/linux/ceph/messenger.h
index 76fbc957bc13..5956d62c3057 100644
--- a/fs/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -65,6 +65,9 @@ struct ceph_messenger {
65 */ 65 */
66 u32 global_seq; 66 u32 global_seq;
67 spinlock_t global_seq_lock; 67 spinlock_t global_seq_lock;
68
69 u32 supported_features;
70 u32 required_features;
68}; 71};
69 72
70/* 73/*
@@ -82,6 +85,10 @@ struct ceph_msg {
82 struct ceph_pagelist *pagelist; /* instead of pages */ 85 struct ceph_pagelist *pagelist; /* instead of pages */
83 struct list_head list_head; 86 struct list_head list_head;
84 struct kref kref; 87 struct kref kref;
88 struct bio *bio; /* instead of pages/pagelist */
89 struct bio *bio_iter; /* bio iterator */
90 int bio_seg; /* current bio segment */
91 struct ceph_pagelist *trail; /* the trailing part of the data */
85 bool front_is_vmalloc; 92 bool front_is_vmalloc;
86 bool more_to_follow; 93 bool more_to_follow;
87 bool needs_out_seq; 94 bool needs_out_seq;
@@ -205,7 +212,7 @@ struct ceph_connection {
205}; 212};
206 213
207 214
208extern const char *pr_addr(const struct sockaddr_storage *ss); 215extern const char *ceph_pr_addr(const struct sockaddr_storage *ss);
209extern int ceph_parse_ips(const char *c, const char *end, 216extern int ceph_parse_ips(const char *c, const char *end,
210 struct ceph_entity_addr *addr, 217 struct ceph_entity_addr *addr,
211 int max_count, int *count); 218 int max_count, int *count);
@@ -216,7 +223,8 @@ extern void ceph_msgr_exit(void);
216extern void ceph_msgr_flush(void); 223extern void ceph_msgr_flush(void);
217 224
218extern struct ceph_messenger *ceph_messenger_create( 225extern struct ceph_messenger *ceph_messenger_create(
219 struct ceph_entity_addr *myaddr); 226 struct ceph_entity_addr *myaddr,
227 u32 features, u32 required);
220extern void ceph_messenger_destroy(struct ceph_messenger *); 228extern void ceph_messenger_destroy(struct ceph_messenger *);
221 229
222extern void ceph_con_init(struct ceph_messenger *msgr, 230extern void ceph_con_init(struct ceph_messenger *msgr,
diff --git a/fs/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index 8e396f2c0963..545f85917780 100644
--- a/fs/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -79,6 +79,7 @@ struct ceph_mon_client {
79 u64 last_tid; 79 u64 last_tid;
80 80
81 /* mds/osd map */ 81 /* mds/osd map */
82 int want_mdsmap;
82 int want_next_osdmap; /* 1 = want, 2 = want+asked */ 83 int want_next_osdmap; /* 1 = want, 2 = want+asked */
83 u32 have_osdmap, have_mdsmap; 84 u32 have_osdmap, have_mdsmap;
84 85
diff --git a/fs/ceph/msgpool.h b/include/linux/ceph/msgpool.h
index a362605f9368..a362605f9368 100644
--- a/fs/ceph/msgpool.h
+++ b/include/linux/ceph/msgpool.h
diff --git a/fs/ceph/msgr.h b/include/linux/ceph/msgr.h
index 680d3d648cac..680d3d648cac 100644
--- a/fs/ceph/msgr.h
+++ b/include/linux/ceph/msgr.h
diff --git a/fs/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index ce776989ef6a..6c91fb032c39 100644
--- a/fs/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -15,6 +15,7 @@ struct ceph_snap_context;
15struct ceph_osd_request; 15struct ceph_osd_request;
16struct ceph_osd_client; 16struct ceph_osd_client;
17struct ceph_authorizer; 17struct ceph_authorizer;
18struct ceph_pagelist;
18 19
19/* 20/*
20 * completion callback for async writepages 21 * completion callback for async writepages
@@ -68,6 +69,7 @@ struct ceph_osd_request {
68 struct list_head r_unsafe_item; 69 struct list_head r_unsafe_item;
69 70
70 struct inode *r_inode; /* for use by callbacks */ 71 struct inode *r_inode; /* for use by callbacks */
72 void *r_priv; /* ditto */
71 73
72 char r_oid[40]; /* object name */ 74 char r_oid[40]; /* object name */
73 int r_oid_len; 75 int r_oid_len;
@@ -80,6 +82,11 @@ struct ceph_osd_request {
80 struct page **r_pages; /* pages for data payload */ 82 struct page **r_pages; /* pages for data payload */
81 int r_pages_from_pool; 83 int r_pages_from_pool;
82 int r_own_pages; /* if true, i own page list */ 84 int r_own_pages; /* if true, i own page list */
85#ifdef CONFIG_BLOCK
86 struct bio *r_bio; /* instead of pages */
87#endif
88
89 struct ceph_pagelist *r_trail; /* trailing part of the data */
83}; 90};
84 91
85struct ceph_osd_client { 92struct ceph_osd_client {
@@ -110,6 +117,42 @@ struct ceph_osd_client {
110 struct ceph_msgpool msgpool_op_reply; 117 struct ceph_msgpool msgpool_op_reply;
111}; 118};
112 119
120struct ceph_osd_req_op {
121 u16 op; /* CEPH_OSD_OP_* */
122 u32 flags; /* CEPH_OSD_FLAG_* */
123 union {
124 struct {
125 u64 offset, length;
126 u64 truncate_size;
127 u32 truncate_seq;
128 } extent;
129 struct {
130 const char *name;
131 u32 name_len;
132 const char *val;
133 u32 value_len;
134 __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
135 __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
136 } xattr;
137 struct {
138 const char *class_name;
139 __u8 class_len;
140 const char *method_name;
141 __u8 method_len;
142 __u8 argc;
143 const char *indata;
144 u32 indata_len;
145 } cls;
146 struct {
147 u64 cookie, count;
148 } pgls;
149 struct {
150 u64 snapid;
151 } snap;
152 };
153 u32 payload_len;
154};
155
113extern int ceph_osdc_init(struct ceph_osd_client *osdc, 156extern int ceph_osdc_init(struct ceph_osd_client *osdc,
114 struct ceph_client *client); 157 struct ceph_client *client);
115extern void ceph_osdc_stop(struct ceph_osd_client *osdc); 158extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
@@ -119,6 +162,30 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
119extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, 162extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
120 struct ceph_msg *msg); 163 struct ceph_msg *msg);
121 164
165extern void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
166 struct ceph_file_layout *layout,
167 u64 snapid,
168 u64 off, u64 *plen, u64 *bno,
169 struct ceph_osd_request *req,
170 struct ceph_osd_req_op *op);
171
172extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
173 int flags,
174 struct ceph_snap_context *snapc,
175 struct ceph_osd_req_op *ops,
176 bool use_mempool,
177 gfp_t gfp_flags,
178 struct page **pages,
179 struct bio *bio);
180
181extern void ceph_osdc_build_request(struct ceph_osd_request *req,
182 u64 off, u64 *plen,
183 struct ceph_osd_req_op *src_ops,
184 struct ceph_snap_context *snapc,
185 struct timespec *mtime,
186 const char *oid,
187 int oid_len);
188
122extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, 189extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
123 struct ceph_file_layout *layout, 190 struct ceph_file_layout *layout,
124 struct ceph_vino vino, 191 struct ceph_vino vino,
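
Editor's note: the new ceph_osd_req_op descriptor lets callers describe each OSD operation up front and hand an array of them to ceph_osdc_alloc_request()/ceph_osdc_build_request(). A stand-alone sketch of filling the extent variant for a write; the struct is trimmed to that one union member, and the opcode value is purely illustrative rather than taken from this diff.

```c
#include <stdio.h>
#include <stdint.h>

#define CEPH_OSD_OP_WRITE 0x2201	/* illustrative value only */

struct ceph_osd_req_op {
	uint16_t op;			/* CEPH_OSD_OP_* */
	uint32_t flags;			/* CEPH_OSD_FLAG_* */
	union {
		struct {
			uint64_t offset, length;
			uint64_t truncate_size;
			uint32_t truncate_seq;
		} extent;
	};
	uint32_t payload_len;
};

int main(void)
{
	struct ceph_osd_req_op op = {
		.op = CEPH_OSD_OP_WRITE,
		.extent = { .offset = 4096, .length = 8192 },
	};

	printf("op %#x: %llu bytes at offset %llu\n", op.op,
	       (unsigned long long)op.extent.length,
	       (unsigned long long)op.extent.offset);
	return 0;
}
```
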
diff --git a/fs/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 970b547e510d..ba4c205cbb01 100644
--- a/fs/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -4,7 +4,7 @@
4#include <linux/rbtree.h> 4#include <linux/rbtree.h>
5#include "types.h" 5#include "types.h"
6#include "ceph_fs.h" 6#include "ceph_fs.h"
7#include "crush/crush.h" 7#include <linux/crush/crush.h>
8 8
9/* 9/*
10 * The osd map describes the current membership of the osd cluster and 10 * The osd map describes the current membership of the osd cluster and
@@ -125,4 +125,6 @@ extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
125extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, 125extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
126 struct ceph_pg pgid); 126 struct ceph_pg pgid);
127 127
128extern int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name);
129
128#endif 130#endif
diff --git a/fs/ceph/pagelist.h b/include/linux/ceph/pagelist.h
index e8a4187e1087..9660d6b0a35d 100644
--- a/fs/ceph/pagelist.h
+++ b/include/linux/ceph/pagelist.h
@@ -8,6 +8,14 @@ struct ceph_pagelist {
8 void *mapped_tail; 8 void *mapped_tail;
9 size_t length; 9 size_t length;
10 size_t room; 10 size_t room;
11 struct list_head free_list;
12 size_t num_pages_free;
13};
14
15struct ceph_pagelist_cursor {
16 struct ceph_pagelist *pl; /* pagelist, for error checking */
17 struct list_head *page_lru; /* page in list */
18 size_t room; /* room remaining to reset to */
11}; 19};
12 20
13static inline void ceph_pagelist_init(struct ceph_pagelist *pl) 21static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
@@ -16,10 +24,23 @@ static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
16 pl->mapped_tail = NULL; 24 pl->mapped_tail = NULL;
17 pl->length = 0; 25 pl->length = 0;
18 pl->room = 0; 26 pl->room = 0;
27 INIT_LIST_HEAD(&pl->free_list);
28 pl->num_pages_free = 0;
19} 29}
30
20extern int ceph_pagelist_release(struct ceph_pagelist *pl); 31extern int ceph_pagelist_release(struct ceph_pagelist *pl);
21 32
22extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l); 33extern int ceph_pagelist_append(struct ceph_pagelist *pl, const void *d, size_t l);
34
35extern int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space);
36
37extern int ceph_pagelist_free_reserve(struct ceph_pagelist *pl);
38
39extern void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,
40 struct ceph_pagelist_cursor *c);
41
42extern int ceph_pagelist_truncate(struct ceph_pagelist *pl,
43 struct ceph_pagelist_cursor *c);
23 44
24static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v) 45static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
25{ 46{
diff --git a/fs/ceph/rados.h b/include/linux/ceph/rados.h
index 6d5247f2e81b..6d5247f2e81b 100644
--- a/fs/ceph/rados.h
+++ b/include/linux/ceph/rados.h
diff --git a/fs/ceph/types.h b/include/linux/ceph/types.h
index 28b35a005ec2..28b35a005ec2 100644
--- a/fs/ceph/types.h
+++ b/include/linux/ceph/types.h
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 0c991023ee47..709dfb901d11 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -75,7 +75,7 @@ struct cgroup_subsys_state {
75 75
76 unsigned long flags; 76 unsigned long flags;
77 /* ID for this css, if possible */ 77 /* ID for this css, if possible */
78 struct css_id *id; 78 struct css_id __rcu *id;
79}; 79};
80 80
81/* bits in struct cgroup_subsys_state flags field */ 81/* bits in struct cgroup_subsys_state flags field */
@@ -205,7 +205,7 @@ struct cgroup {
205 struct list_head children; /* my children */ 205 struct list_head children; /* my children */
206 206
207 struct cgroup *parent; /* my parent */ 207 struct cgroup *parent; /* my parent */
208 struct dentry *dentry; /* cgroup fs entry, RCU protected */ 208 struct dentry __rcu *dentry; /* cgroup fs entry, RCU protected */
209 209
210 /* Private pointers for each registered subsystem */ 210 /* Private pointers for each registered subsystem */
211 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; 211 struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT];
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index c1a62c56a660..320d6c94ff84 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -16,7 +16,11 @@
16# define __release(x) __context__(x,-1) 16# define __release(x) __context__(x,-1)
17# define __cond_lock(x,c) ((c) ? ({ __acquire(x); 1; }) : 0) 17# define __cond_lock(x,c) ((c) ? ({ __acquire(x); 1; }) : 0)
18# define __percpu __attribute__((noderef, address_space(3))) 18# define __percpu __attribute__((noderef, address_space(3)))
19#ifdef CONFIG_SPARSE_RCU_POINTER
20# define __rcu __attribute__((noderef, address_space(4)))
21#else
19# define __rcu 22# define __rcu
23#endif
20extern void __chk_user_ptr(const volatile void __user *); 24extern void __chk_user_ptr(const volatile void __user *);
21extern void __chk_io_ptr(const volatile void __iomem *); 25extern void __chk_io_ptr(const volatile void __iomem *);
22#else 26#else
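
With CONFIG_SPARSE_RCU_POINTER enabled, the __rcu annotation above places RCU-protected pointers in their own sparse address space, so plain loads and stores of such a pointer are reported and only the rcu_*() accessors pass cleanly. A short hedged sketch of the pattern it checks (struct foo and its globals are hypothetical):

#include <linux/rcupdate.h>

struct foo {
        int val;
};

static struct foo __rcu *global_foo;    /* sparse now tracks this pointer */

static void publish_foo(struct foo *newf)
{
        rcu_assign_pointer(global_foo, newf);   /* accepted by sparse */
}

static int read_foo_val(void)
{
        struct foo *f;
        int val = -1;

        rcu_read_lock();
        f = rcu_dereference(global_foo);        /* accepted by sparse */
        if (f)
                val = f->val;
        rcu_read_unlock();
        return val;
}
/* A direct "global_foo->val" would now draw a sparse address-space warning. */
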
diff --git a/include/linux/cred.h b/include/linux/cred.h
index 4d2c39573f36..4aaeab376446 100644
--- a/include/linux/cred.h
+++ b/include/linux/cred.h
@@ -84,7 +84,7 @@ struct thread_group_cred {
84 atomic_t usage; 84 atomic_t usage;
85 pid_t tgid; /* thread group process ID */ 85 pid_t tgid; /* thread group process ID */
86 spinlock_t lock; 86 spinlock_t lock;
87 struct key *session_keyring; /* keyring inherited over fork */ 87 struct key __rcu *session_keyring; /* keyring inherited over fork */
88 struct key *process_keyring; /* keyring private to this process */ 88 struct key *process_keyring; /* keyring private to this process */
89 struct rcu_head rcu; /* RCU deletion hook */ 89 struct rcu_head rcu; /* RCU deletion hook */
90}; 90};
diff --git a/fs/ceph/crush/crush.h b/include/linux/crush/crush.h
index 97e435b191f4..97e435b191f4 100644
--- a/fs/ceph/crush/crush.h
+++ b/include/linux/crush/crush.h
diff --git a/fs/ceph/crush/hash.h b/include/linux/crush/hash.h
index 91e884230d5d..91e884230d5d 100644
--- a/fs/ceph/crush/hash.h
+++ b/include/linux/crush/hash.h
diff --git a/fs/ceph/crush/mapper.h b/include/linux/crush/mapper.h
index c46b99c18bb0..c46b99c18bb0 100644
--- a/fs/ceph/crush/mapper.h
+++ b/include/linux/crush/mapper.h
diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h
index 29b3ce3f2a1d..2833452ea01c 100644
--- a/include/linux/debug_locks.h
+++ b/include/linux/debug_locks.h
@@ -49,7 +49,6 @@ struct task_struct;
49 49
50#ifdef CONFIG_LOCKDEP 50#ifdef CONFIG_LOCKDEP
51extern void debug_show_all_locks(void); 51extern void debug_show_all_locks(void);
52extern void __debug_show_held_locks(struct task_struct *task);
53extern void debug_show_held_locks(struct task_struct *task); 52extern void debug_show_held_locks(struct task_struct *task);
54extern void debug_check_no_locks_freed(const void *from, unsigned long len); 53extern void debug_check_no_locks_freed(const void *from, unsigned long len);
55extern void debug_check_no_locks_held(struct task_struct *task); 54extern void debug_check_no_locks_held(struct task_struct *task);
@@ -58,10 +57,6 @@ static inline void debug_show_all_locks(void)
58{ 57{
59} 58}
60 59
61static inline void __debug_show_held_locks(struct task_struct *task)
62{
63}
64
65static inline void debug_show_held_locks(struct task_struct *task) 60static inline void debug_show_held_locks(struct task_struct *task)
66{ 61{
67} 62}
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index d7cecc90ed34..51651b76d40f 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -106,6 +106,7 @@ struct irte {
106 __u64 high; 106 __u64 high;
107 }; 107 };
108}; 108};
109
109#ifdef CONFIG_INTR_REMAP 110#ifdef CONFIG_INTR_REMAP
110extern int intr_remapping_enabled; 111extern int intr_remapping_enabled;
111extern int intr_remapping_supported(void); 112extern int intr_remapping_supported(void);
@@ -119,11 +120,8 @@ extern int alloc_irte(struct intel_iommu *iommu, int irq, u16 count);
119extern int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index, 120extern int set_irte_irq(int irq, struct intel_iommu *iommu, u16 index,
120 u16 sub_handle); 121 u16 sub_handle);
121extern int map_irq_to_irte_handle(int irq, u16 *sub_handle); 122extern int map_irq_to_irte_handle(int irq, u16 *sub_handle);
122extern int clear_irte_irq(int irq, struct intel_iommu *iommu, u16 index);
123extern int flush_irte(int irq);
124extern int free_irte(int irq); 123extern int free_irte(int irq);
125 124
126extern int irq_remapped(int irq);
127extern struct intel_iommu *map_dev_to_ir(struct pci_dev *dev); 125extern struct intel_iommu *map_dev_to_ir(struct pci_dev *dev);
128extern struct intel_iommu *map_ioapic_to_ir(int apic); 126extern struct intel_iommu *map_ioapic_to_ir(int apic);
129extern struct intel_iommu *map_hpet_to_ir(u8 id); 127extern struct intel_iommu *map_hpet_to_ir(u8 id);
@@ -177,7 +175,6 @@ static inline int set_msi_sid(struct irte *irte, struct pci_dev *dev)
177 return 0; 175 return 0;
178} 176}
179 177
180#define irq_remapped(irq) (0)
181#define enable_intr_remapping(mode) (-1) 178#define enable_intr_remapping(mode) (-1)
182#define disable_intr_remapping() (0) 179#define disable_intr_remapping() (0)
183#define reenable_intr_remapping(mode) (0) 180#define reenable_intr_remapping(mode) (0)
@@ -187,8 +184,9 @@ static inline int set_msi_sid(struct irte *irte, struct pci_dev *dev)
187/* Can't use the common MSI interrupt functions 184/* Can't use the common MSI interrupt functions
188 * since DMAR is not a pci device 185 * since DMAR is not a pci device
189 */ 186 */
190extern void dmar_msi_unmask(unsigned int irq); 187struct irq_data;
191extern void dmar_msi_mask(unsigned int irq); 188extern void dmar_msi_unmask(struct irq_data *data);
189extern void dmar_msi_mask(struct irq_data *data);
192extern void dmar_msi_read(int irq, struct msi_msg *msg); 190extern void dmar_msi_read(int irq, struct msi_msg *msg);
193extern void dmar_msi_write(int irq, struct msi_msg *msg); 191extern void dmar_msi_write(int irq, struct msi_msg *msg);
194extern int dmar_set_interrupt(struct intel_iommu *iommu); 192extern int dmar_set_interrupt(struct intel_iommu *iommu);
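
With dmar_msi_unmask()/dmar_msi_mask() now taking a struct irq_data (introduced later in this patch's irq.h changes), they can be plugged straight into the new irq_* members of struct irq_chip. A hedged sketch of that wiring (the chip instance itself is illustrative; only the callbacks and fields come from the headers in this patch):

#include <linux/irq.h>
#include <linux/dmar.h>

/* Illustrative chip: the converted callbacks slot into the irq_* members. */
static struct irq_chip dmar_msi_chip_sketch = {
        .name           = "DMAR-MSI",
        .irq_unmask     = dmar_msi_unmask,
        .irq_mask       = dmar_msi_mask,
};

The deprecated .mask/.unmask members, which still take a bare irq number, remain available behind CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED during the transition.
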
diff --git a/include/linux/edac.h b/include/linux/edac.h
index 7cf92e8a4196..36c66443bdfd 100644
--- a/include/linux/edac.h
+++ b/include/linux/edac.h
@@ -13,6 +13,7 @@
13#define _LINUX_EDAC_H_ 13#define _LINUX_EDAC_H_
14 14
15#include <asm/atomic.h> 15#include <asm/atomic.h>
16#include <linux/sysdev.h>
16 17
17#define EDAC_OPSTATE_INVAL -1 18#define EDAC_OPSTATE_INVAL -1
18#define EDAC_OPSTATE_POLL 0 19#define EDAC_OPSTATE_POLL 0
@@ -22,9 +23,12 @@
22extern int edac_op_state; 23extern int edac_op_state;
23extern int edac_err_assert; 24extern int edac_err_assert;
24extern atomic_t edac_handlers; 25extern atomic_t edac_handlers;
26extern struct sysdev_class edac_class;
25 27
26extern int edac_handler_set(void); 28extern int edac_handler_set(void);
27extern void edac_atomic_assert_error(void); 29extern void edac_atomic_assert_error(void);
30extern struct sysdev_class *edac_get_sysfs_class(void);
31extern void edac_put_sysfs_class(void);
28 32
29static inline void opstate_init(void) 33static inline void opstate_init(void)
30{ 34{
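
edac_get_sysfs_class()/edac_put_sysfs_class() turn the formerly direct use of edac_class into a get/put pair. A hedged sketch of a driver-side user, assuming the usual sysdev API (foo_edac_sysdev and the ids are hypothetical):

#include <linux/edac.h>
#include <linux/sysdev.h>

static struct sys_device foo_edac_sysdev;

static int foo_edac_register(void)
{
        struct sysdev_class *cls = edac_get_sysfs_class();
        int err;

        if (!cls)
                return -ENODEV;

        foo_edac_sysdev.id = 0;
        foo_edac_sysdev.cls = cls;
        err = sysdev_register(&foo_edac_sysdev);
        if (err)
                edac_put_sysfs_class();         /* drop the ref on failure */
        return err;
}

static void foo_edac_unregister(void)
{
        sysdev_unregister(&foo_edac_sysdev);
        edac_put_sysfs_class();
}
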
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index f59ed297b661..133c0ba25e30 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -31,7 +31,7 @@ struct embedded_fd_set {
31 31
32struct fdtable { 32struct fdtable {
33 unsigned int max_fds; 33 unsigned int max_fds;
34 struct file ** fd; /* current fd array */ 34 struct file __rcu **fd; /* current fd array */
35 fd_set *close_on_exec; 35 fd_set *close_on_exec;
36 fd_set *open_fds; 36 fd_set *open_fds;
37 struct rcu_head rcu; 37 struct rcu_head rcu;
@@ -46,7 +46,7 @@ struct files_struct {
46 * read mostly part 46 * read mostly part
47 */ 47 */
48 atomic_t count; 48 atomic_t count;
49 struct fdtable *fdt; 49 struct fdtable __rcu *fdt;
50 struct fdtable fdtab; 50 struct fdtable fdtab;
51 /* 51 /*
52 * written part on a separate cache line in SMP 52 * written part on a separate cache line in SMP
@@ -55,7 +55,7 @@ struct files_struct {
55 int next_fd; 55 int next_fd;
56 struct embedded_fd_set close_on_exec_init; 56 struct embedded_fd_set close_on_exec_init;
57 struct embedded_fd_set open_fds_init; 57 struct embedded_fd_set open_fds_init;
58 struct file * fd_array[NR_OPEN_DEFAULT]; 58 struct file __rcu * fd_array[NR_OPEN_DEFAULT];
59}; 59};
60 60
61#define rcu_dereference_check_fdtable(files, fdtfd) \ 61#define rcu_dereference_check_fdtable(files, fdtfd) \
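
The __rcu markings above describe how the fd table is meant to be read: take rcu_read_lock(), fetch the fdtable through its RCU accessor, and dereference the fd slot the same way. A hedged sketch of that lookup (it mirrors what fcheck_files() does; files_fdtable() is assumed from the same header):

#include <linux/fdtable.h>
#include <linux/rcupdate.h>

/* Peek at an fd slot without taking files->file_lock. */
static struct file *foo_peek_file(struct files_struct *files, unsigned int fd)
{
        struct fdtable *fdt;
        struct file *file = NULL;

        rcu_read_lock();
        fdt = files_fdtable(files);     /* RCU-safe load of files->fdt */
        if (fd < fdt->max_fds)
                file = rcu_dereference(fdt->fd[fd]);
        rcu_read_unlock();

        return file;    /* caller still needs a reference before use */
}
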
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 63d069bd80b7..3168dcfb94f2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1384,7 +1384,7 @@ struct super_block {
1384 * Saved mount options for lazy filesystems using 1384 * Saved mount options for lazy filesystems using
1385 * generic_show_options() 1385 * generic_show_options()
1386 */ 1386 */
1387 char *s_options; 1387 char __rcu *s_options;
1388}; 1388};
1389 1389
1390extern struct timespec current_fs_time(struct super_block *sb); 1390extern struct timespec current_fs_time(struct super_block *sb);
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 5f2f4c4d8fb0..af3f06b41dc1 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -129,8 +129,8 @@ struct blk_scsi_cmd_filter {
129struct disk_part_tbl { 129struct disk_part_tbl {
130 struct rcu_head rcu_head; 130 struct rcu_head rcu_head;
131 int len; 131 int len;
132 struct hd_struct *last_lookup; 132 struct hd_struct __rcu *last_lookup;
133 struct hd_struct *part[]; 133 struct hd_struct __rcu *part[];
134}; 134};
135 135
136struct gendisk { 136struct gendisk {
@@ -149,7 +149,7 @@ struct gendisk {
149 * non-critical accesses use RCU. Always access through 149 * non-critical accesses use RCU. Always access through
150 * helpers. 150 * helpers.
151 */ 151 */
152 struct disk_part_tbl *part_tbl; 152 struct disk_part_tbl __rcu *part_tbl;
153 struct hd_struct part0; 153 struct hd_struct part0;
154 154
155 const struct block_device_operations *fops; 155 const struct block_device_operations *fops;
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index d5b387669dab..96c323ac44df 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -64,6 +64,8 @@
64#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) 64#define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
65#define NMI_OFFSET (1UL << NMI_SHIFT) 65#define NMI_OFFSET (1UL << NMI_SHIFT)
66 66
67#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
68
67#ifndef PREEMPT_ACTIVE 69#ifndef PREEMPT_ACTIVE
68#define PREEMPT_ACTIVE_BITS 1 70#define PREEMPT_ACTIVE_BITS 1
69#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS) 71#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS)
@@ -82,10 +84,13 @@
82/* 84/*
83 * Are we doing bottom half or hardware interrupt processing? 85 * Are we doing bottom half or hardware interrupt processing?
84 * Are we in a softirq context? Interrupt context? 86 * Are we in a softirq context? Interrupt context?
87 * in_softirq - Are we currently processing softirq or have bh disabled?
88 * in_serving_softirq - Are we currently processing softirq?
85 */ 89 */
86#define in_irq() (hardirq_count()) 90#define in_irq() (hardirq_count())
87#define in_softirq() (softirq_count()) 91#define in_softirq() (softirq_count())
88#define in_interrupt() (irq_count()) 92#define in_interrupt() (irq_count())
93#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
89 94
90/* 95/*
91 * Are we in NMI context? 96 * Are we in NMI context?
@@ -132,14 +137,16 @@ extern void synchronize_irq(unsigned int irq);
132 137
133struct task_struct; 138struct task_struct;
134 139
135#ifndef CONFIG_VIRT_CPU_ACCOUNTING 140#if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING)
136static inline void account_system_vtime(struct task_struct *tsk) 141static inline void account_system_vtime(struct task_struct *tsk)
137{ 142{
138} 143}
144#else
145extern void account_system_vtime(struct task_struct *tsk);
139#endif 146#endif
140 147
141#if defined(CONFIG_NO_HZ) 148#if defined(CONFIG_NO_HZ)
142#if defined(CONFIG_TINY_RCU) 149#if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU)
143extern void rcu_enter_nohz(void); 150extern void rcu_enter_nohz(void);
144extern void rcu_exit_nohz(void); 151extern void rcu_exit_nohz(void);
145 152
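
The new in_serving_softirq() answers a narrower question than in_softirq(): the latter is also true in process context while bottom halves are merely disabled. A small hedged sketch showing the difference (foo_update()/foo_caller() are hypothetical):

#include <linux/hardirq.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>

static void foo_update(void)
{
        if (in_serving_softirq())
                pr_debug("running inside a softirq handler\n");
        else if (in_softirq())
                pr_debug("process context with bottom halves disabled\n");
}

static void foo_caller(void)
{
        local_bh_disable();
        foo_update();           /* takes the second branch, not the first */
        local_bh_enable();
}
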
diff --git a/include/linux/htirq.h b/include/linux/htirq.h
index c96ea46737d0..70a1dbbf2093 100644
--- a/include/linux/htirq.h
+++ b/include/linux/htirq.h
@@ -9,8 +9,9 @@ struct ht_irq_msg {
9/* Helper functions.. */ 9/* Helper functions.. */
10void fetch_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg); 10void fetch_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg);
11void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg); 11void write_ht_irq_msg(unsigned int irq, struct ht_irq_msg *msg);
12void mask_ht_irq(unsigned int irq); 12struct irq_data;
13void unmask_ht_irq(unsigned int irq); 13void mask_ht_irq(struct irq_data *data);
14void unmask_ht_irq(struct irq_data *data);
14 15
15/* The arch hook for getting things started */ 16/* The arch hook for getting things started */
16int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev); 17int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev);
diff --git a/include/linux/idr.h b/include/linux/idr.h
index e968db71e33a..cdb715e58e3e 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -50,14 +50,14 @@
50 50
51struct idr_layer { 51struct idr_layer {
52 unsigned long bitmap; /* A zero bit means "space here" */ 52 unsigned long bitmap; /* A zero bit means "space here" */
53 struct idr_layer *ary[1<<IDR_BITS]; 53 struct idr_layer __rcu *ary[1<<IDR_BITS];
54 int count; /* When zero, we can release it */ 54 int count; /* When zero, we can release it */
55 int layer; /* distance from leaf */ 55 int layer; /* distance from leaf */
56 struct rcu_head rcu_head; 56 struct rcu_head rcu_head;
57}; 57};
58 58
59struct idr { 59struct idr {
60 struct idr_layer *top; 60 struct idr_layer __rcu *top;
61 struct idr_layer *id_free; 61 struct idr_layer *id_free;
62 int layers; /* only valid without concurrent changes */ 62 int layers; /* only valid without concurrent changes */
63 int id_free_cnt; 63 int id_free_cnt;
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 1f43fa56f600..2fea6c8ef6ba 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -82,11 +82,17 @@ extern struct group_info init_groups;
82# define CAP_INIT_BSET CAP_FULL_SET 82# define CAP_INIT_BSET CAP_FULL_SET
83 83
84#ifdef CONFIG_TREE_PREEMPT_RCU 84#ifdef CONFIG_TREE_PREEMPT_RCU
85#define INIT_TASK_RCU_TREE_PREEMPT() \
86 .rcu_blocked_node = NULL,
87#else
 88#define INIT_TASK_RCU_TREE_PREEMPT()
89#endif
90#ifdef CONFIG_PREEMPT_RCU
85#define INIT_TASK_RCU_PREEMPT(tsk) \ 91#define INIT_TASK_RCU_PREEMPT(tsk) \
86 .rcu_read_lock_nesting = 0, \ 92 .rcu_read_lock_nesting = 0, \
87 .rcu_read_unlock_special = 0, \ 93 .rcu_read_unlock_special = 0, \
88 .rcu_blocked_node = NULL, \ 94 .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), \
89 .rcu_node_entry = LIST_HEAD_INIT(tsk.rcu_node_entry), 95 INIT_TASK_RCU_TREE_PREEMPT()
90#else 96#else
91#define INIT_TASK_RCU_PREEMPT(tsk) 97#define INIT_TASK_RCU_PREEMPT(tsk)
92#endif 98#endif
@@ -137,8 +143,8 @@ extern struct cred init_cred;
137 .children = LIST_HEAD_INIT(tsk.children), \ 143 .children = LIST_HEAD_INIT(tsk.children), \
138 .sibling = LIST_HEAD_INIT(tsk.sibling), \ 144 .sibling = LIST_HEAD_INIT(tsk.sibling), \
139 .group_leader = &tsk, \ 145 .group_leader = &tsk, \
140 .real_cred = &init_cred, \ 146 RCU_INIT_POINTER(.real_cred, &init_cred), \
141 .cred = &init_cred, \ 147 RCU_INIT_POINTER(.cred, &init_cred), \
142 .cred_guard_mutex = \ 148 .cred_guard_mutex = \
143 __MUTEX_INITIALIZER(tsk.cred_guard_mutex), \ 149 __MUTEX_INITIALIZER(tsk.cred_guard_mutex), \
144 .comm = "swapper", \ 150 .comm = "swapper", \
diff --git a/include/linux/input.h b/include/linux/input.h
index 896a92227bc4..d6ae1761be97 100644
--- a/include/linux/input.h
+++ b/include/linux/input.h
@@ -1196,7 +1196,7 @@ struct input_dev {
1196 int (*flush)(struct input_dev *dev, struct file *file); 1196 int (*flush)(struct input_dev *dev, struct file *file);
1197 int (*event)(struct input_dev *dev, unsigned int type, unsigned int code, int value); 1197 int (*event)(struct input_dev *dev, unsigned int type, unsigned int code, int value);
1198 1198
1199 struct input_handle *grab; 1199 struct input_handle __rcu *grab;
1200 1200
1201 spinlock_t event_lock; 1201 spinlock_t event_lock;
1202 struct mutex mutex; 1202 struct mutex mutex;
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 531495db1708..414328577ced 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -647,11 +647,8 @@ static inline void init_irq_proc(void)
647struct seq_file; 647struct seq_file;
648int show_interrupts(struct seq_file *p, void *v); 648int show_interrupts(struct seq_file *p, void *v);
649 649
650struct irq_desc;
651
652extern int early_irq_init(void); 650extern int early_irq_init(void);
653extern int arch_probe_nr_irqs(void); 651extern int arch_probe_nr_irqs(void);
654extern int arch_early_irq_init(void); 652extern int arch_early_irq_init(void);
655extern int arch_init_chip_data(struct irq_desc *desc, int node);
656 653
657#endif 654#endif
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 64d529133031..3e70b21884a9 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -53,7 +53,7 @@ struct io_context {
53 53
54 struct radix_tree_root radix_root; 54 struct radix_tree_root radix_root;
55 struct hlist_head cic_list; 55 struct hlist_head cic_list;
56 void *ioc_data; 56 void __rcu *ioc_data;
57}; 57};
58 58
59static inline struct io_context *ioc_task_link(struct io_context *ioc) 59static inline struct io_context *ioc_task_link(struct io_context *ioc)
diff --git a/include/linux/irq.h b/include/linux/irq.h
index c03243ad84b4..e9639115dff1 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -72,6 +72,10 @@ typedef void (*irq_flow_handler_t)(unsigned int irq,
72#define IRQ_ONESHOT 0x08000000 /* IRQ is not unmasked after hardirq */ 72#define IRQ_ONESHOT 0x08000000 /* IRQ is not unmasked after hardirq */
73#define IRQ_NESTED_THREAD 0x10000000 /* IRQ is nested into another, no own handler thread */ 73#define IRQ_NESTED_THREAD 0x10000000 /* IRQ is nested into another, no own handler thread */
74 74
75#define IRQF_MODIFY_MASK \
76 (IRQ_TYPE_SENSE_MASK | IRQ_NOPROBE | IRQ_NOREQUEST | \
77 IRQ_NOAUTOEN | IRQ_MOVE_PCNTXT | IRQ_LEVEL)
78
75#ifdef CONFIG_IRQ_PER_CPU 79#ifdef CONFIG_IRQ_PER_CPU
76# define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU) 80# define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU)
77# define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING) 81# define IRQ_NO_BALANCING_MASK (IRQ_PER_CPU | IRQ_NO_BALANCING)
@@ -80,36 +84,77 @@ typedef void (*irq_flow_handler_t)(unsigned int irq,
80# define IRQ_NO_BALANCING_MASK IRQ_NO_BALANCING 84# define IRQ_NO_BALANCING_MASK IRQ_NO_BALANCING
81#endif 85#endif
82 86
83struct proc_dir_entry;
84struct msi_desc; 87struct msi_desc;
85 88
86/** 89/**
90 * struct irq_data - per irq and irq chip data passed down to chip functions
91 * @irq: interrupt number
92 * @node: node index useful for balancing
93 * @chip: low level interrupt hardware access
94 * @handler_data: per-IRQ data for the irq_chip methods
95 * @chip_data: platform-specific per-chip private data for the chip
96 * methods, to allow shared chip implementations
97 * @msi_desc: MSI descriptor
98 * @affinity: IRQ affinity on SMP
99 *
 100 * The fields here need to overlay the ones in irq_desc until we
 101 * have cleaned up the direct references and switched everything over to
 102 * irq_data.
103 */
104struct irq_data {
105 unsigned int irq;
106 unsigned int node;
107 struct irq_chip *chip;
108 void *handler_data;
109 void *chip_data;
110 struct msi_desc *msi_desc;
111#ifdef CONFIG_SMP
112 cpumask_var_t affinity;
113#endif
114};
115
116/**
87 * struct irq_chip - hardware interrupt chip descriptor 117 * struct irq_chip - hardware interrupt chip descriptor
88 * 118 *
89 * @name: name for /proc/interrupts 119 * @name: name for /proc/interrupts
90 * @startup: start up the interrupt (defaults to ->enable if NULL) 120 * @startup: deprecated, replaced by irq_startup
91 * @shutdown: shut down the interrupt (defaults to ->disable if NULL) 121 * @shutdown: deprecated, replaced by irq_shutdown
92 * @enable: enable the interrupt (defaults to chip->unmask if NULL) 122 * @enable: deprecated, replaced by irq_enable
93 * @disable: disable the interrupt 123 * @disable: deprecated, replaced by irq_disable
94 * @ack: start of a new interrupt 124 * @ack: deprecated, replaced by irq_ack
95 * @mask: mask an interrupt source 125 * @mask: deprecated, replaced by irq_mask
96 * @mask_ack: ack and mask an interrupt source 126 * @mask_ack: deprecated, replaced by irq_mask_ack
97 * @unmask: unmask an interrupt source 127 * @unmask: deprecated, replaced by irq_unmask
98 * @eoi: end of interrupt - chip level 128 * @eoi: deprecated, replaced by irq_eoi
99 * @end: end of interrupt - flow level 129 * @end: deprecated, will go away with __do_IRQ()
100 * @set_affinity: set the CPU affinity on SMP machines 130 * @set_affinity: deprecated, replaced by irq_set_affinity
101 * @retrigger: resend an IRQ to the CPU 131 * @retrigger: deprecated, replaced by irq_retrigger
102 * @set_type: set the flow type (IRQ_TYPE_LEVEL/etc.) of an IRQ 132 * @set_type: deprecated, replaced by irq_set_type
 103 * @set_wake: enable/disable power-management wake-on of an IRQ 133 * @set_wake: deprecated, replaced by irq_set_wake
134 * @bus_lock: deprecated, replaced by irq_bus_lock
135 * @bus_sync_unlock: deprecated, replaced by irq_bus_sync_unlock
104 * 136 *
105 * @bus_lock: function to lock access to slow bus (i2c) chips 137 * @irq_startup: start up the interrupt (defaults to ->enable if NULL)
106 * @bus_sync_unlock: function to sync and unlock slow bus (i2c) chips 138 * @irq_shutdown: shut down the interrupt (defaults to ->disable if NULL)
139 * @irq_enable: enable the interrupt (defaults to chip->unmask if NULL)
140 * @irq_disable: disable the interrupt
141 * @irq_ack: start of a new interrupt
142 * @irq_mask: mask an interrupt source
143 * @irq_mask_ack: ack and mask an interrupt source
144 * @irq_unmask: unmask an interrupt source
145 * @irq_eoi: end of interrupt
146 * @irq_set_affinity: set the CPU affinity on SMP machines
147 * @irq_retrigger: resend an IRQ to the CPU
148 * @irq_set_type: set the flow type (IRQ_TYPE_LEVEL/etc.) of an IRQ
149 * @irq_set_wake: enable/disable power-management wake-on of an IRQ
150 * @irq_bus_lock: function to lock access to slow bus (i2c) chips
 151 * @irq_bus_sync_unlock: function to sync and unlock slow bus (i2c) chips
107 * 152 *
108 * @release: release function solely used by UML 153 * @release: release function solely used by UML
109 * @typename: obsoleted by name, kept as migration helper
110 */ 154 */
111struct irq_chip { 155struct irq_chip {
112 const char *name; 156 const char *name;
157#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
113 unsigned int (*startup)(unsigned int irq); 158 unsigned int (*startup)(unsigned int irq);
114 void (*shutdown)(unsigned int irq); 159 void (*shutdown)(unsigned int irq);
115 void (*enable)(unsigned int irq); 160 void (*enable)(unsigned int irq);
@@ -130,154 +175,66 @@ struct irq_chip {
130 175
131 void (*bus_lock)(unsigned int irq); 176 void (*bus_lock)(unsigned int irq);
132 void (*bus_sync_unlock)(unsigned int irq); 177 void (*bus_sync_unlock)(unsigned int irq);
178#endif
179 unsigned int (*irq_startup)(struct irq_data *data);
180 void (*irq_shutdown)(struct irq_data *data);
181 void (*irq_enable)(struct irq_data *data);
182 void (*irq_disable)(struct irq_data *data);
183
184 void (*irq_ack)(struct irq_data *data);
185 void (*irq_mask)(struct irq_data *data);
186 void (*irq_mask_ack)(struct irq_data *data);
187 void (*irq_unmask)(struct irq_data *data);
188 void (*irq_eoi)(struct irq_data *data);
189
190 int (*irq_set_affinity)(struct irq_data *data, const struct cpumask *dest, bool force);
191 int (*irq_retrigger)(struct irq_data *data);
192 int (*irq_set_type)(struct irq_data *data, unsigned int flow_type);
193 int (*irq_set_wake)(struct irq_data *data, unsigned int on);
194
195 void (*irq_bus_lock)(struct irq_data *data);
196 void (*irq_bus_sync_unlock)(struct irq_data *data);
133 197
134 /* Currently used only by UML, might disappear one day.*/ 198 /* Currently used only by UML, might disappear one day.*/
135#ifdef CONFIG_IRQ_RELEASE_METHOD 199#ifdef CONFIG_IRQ_RELEASE_METHOD
136 void (*release)(unsigned int irq, void *dev_id); 200 void (*release)(unsigned int irq, void *dev_id);
137#endif 201#endif
138 /*
139 * For compatibility, ->typename is copied into ->name.
140 * Will disappear.
141 */
142 const char *typename;
143}; 202};
144 203
145struct timer_rand_state; 204/* This include will go away once we isolated irq_desc usage to core code */
146struct irq_2_iommu; 205#include <linux/irqdesc.h>
147/**
148 * struct irq_desc - interrupt descriptor
149 * @irq: interrupt number for this descriptor
150 * @timer_rand_state: pointer to timer rand state struct
151 * @kstat_irqs: irq stats per cpu
152 * @irq_2_iommu: iommu with this irq
153 * @handle_irq: highlevel irq-events handler [if NULL, __do_IRQ()]
154 * @chip: low level interrupt hardware access
155 * @msi_desc: MSI descriptor
156 * @handler_data: per-IRQ data for the irq_chip methods
157 * @chip_data: platform-specific per-chip private data for the chip
158 * methods, to allow shared chip implementations
159 * @action: the irq action chain
160 * @status: status information
161 * @depth: disable-depth, for nested irq_disable() calls
162 * @wake_depth: enable depth, for multiple set_irq_wake() callers
163 * @irq_count: stats field to detect stalled irqs
164 * @last_unhandled: aging timer for unhandled count
165 * @irqs_unhandled: stats field for spurious unhandled interrupts
166 * @lock: locking for SMP
167 * @affinity: IRQ affinity on SMP
168 * @node: node index useful for balancing
169 * @pending_mask: pending rebalanced interrupts
170 * @threads_active: number of irqaction threads currently running
171 * @wait_for_threads: wait queue for sync_irq to wait for threaded handlers
172 * @dir: /proc/irq/ procfs entry
173 * @name: flow handler name for /proc/interrupts output
174 */
175struct irq_desc {
176 unsigned int irq;
177 struct timer_rand_state *timer_rand_state;
178 unsigned int *kstat_irqs;
179#ifdef CONFIG_INTR_REMAP
180 struct irq_2_iommu *irq_2_iommu;
181#endif
182 irq_flow_handler_t handle_irq;
183 struct irq_chip *chip;
184 struct msi_desc *msi_desc;
185 void *handler_data;
186 void *chip_data;
187 struct irqaction *action; /* IRQ action list */
188 unsigned int status; /* IRQ status */
189
190 unsigned int depth; /* nested irq disables */
191 unsigned int wake_depth; /* nested wake enables */
192 unsigned int irq_count; /* For detecting broken IRQs */
193 unsigned long last_unhandled; /* Aging timer for unhandled count */
194 unsigned int irqs_unhandled;
195 raw_spinlock_t lock;
196#ifdef CONFIG_SMP
197 cpumask_var_t affinity;
198 const struct cpumask *affinity_hint;
199 unsigned int node;
200#ifdef CONFIG_GENERIC_PENDING_IRQ
201 cpumask_var_t pending_mask;
202#endif
203#endif
204 atomic_t threads_active;
205 wait_queue_head_t wait_for_threads;
206#ifdef CONFIG_PROC_FS
207 struct proc_dir_entry *dir;
208#endif
209 const char *name;
210} ____cacheline_internodealigned_in_smp;
211 206
212extern void arch_init_copy_chip_data(struct irq_desc *old_desc, 207/*
213 struct irq_desc *desc, int node); 208 * Pick up the arch-dependent methods:
214extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc); 209 */
210#include <asm/hw_irq.h>
215 211
216#ifndef CONFIG_SPARSE_IRQ 212#ifndef NR_IRQS_LEGACY
217extern struct irq_desc irq_desc[NR_IRQS]; 213# define NR_IRQS_LEGACY 0
218#endif 214#endif
219 215
220#ifdef CONFIG_NUMA_IRQ_DESC 216#ifndef ARCH_IRQ_INIT_FLAGS
221extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int node); 217# define ARCH_IRQ_INIT_FLAGS 0
222#else
223static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
224{
225 return desc;
226}
227#endif 218#endif
228 219
229extern struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node); 220#define IRQ_DEFAULT_INIT_FLAGS (IRQ_DISABLED | ARCH_IRQ_INIT_FLAGS)
230
231/*
232 * Pick up the arch-dependent methods:
233 */
234#include <asm/hw_irq.h>
235 221
222struct irqaction;
236extern int setup_irq(unsigned int irq, struct irqaction *new); 223extern int setup_irq(unsigned int irq, struct irqaction *new);
237extern void remove_irq(unsigned int irq, struct irqaction *act); 224extern void remove_irq(unsigned int irq, struct irqaction *act);
238 225
239#ifdef CONFIG_GENERIC_HARDIRQS 226#ifdef CONFIG_GENERIC_HARDIRQS
240 227
241#ifdef CONFIG_SMP 228#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ)
242
243#ifdef CONFIG_GENERIC_PENDING_IRQ
244
245void move_native_irq(int irq); 229void move_native_irq(int irq);
246void move_masked_irq(int irq); 230void move_masked_irq(int irq);
247 231#else
248#else /* CONFIG_GENERIC_PENDING_IRQ */ 232static inline void move_native_irq(int irq) { }
249 233static inline void move_masked_irq(int irq) { }
250static inline void move_irq(int irq) 234#endif
251{
252}
253
254static inline void move_native_irq(int irq)
255{
256}
257
258static inline void move_masked_irq(int irq)
259{
260}
261
262#endif /* CONFIG_GENERIC_PENDING_IRQ */
263
264#else /* CONFIG_SMP */
265
266#define move_native_irq(x)
267#define move_masked_irq(x)
268
269#endif /* CONFIG_SMP */
270 235
271extern int no_irq_affinity; 236extern int no_irq_affinity;
272 237
273static inline int irq_balancing_disabled(unsigned int irq)
274{
275 struct irq_desc *desc;
276
277 desc = irq_to_desc(irq);
278 return desc->status & IRQ_NO_BALANCING_MASK;
279}
280
281/* Handle irq action chains: */ 238/* Handle irq action chains: */
282extern irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action); 239extern irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action);
283 240
@@ -293,42 +250,10 @@ extern void handle_percpu_irq(unsigned int irq, struct irq_desc *desc);
293extern void handle_bad_irq(unsigned int irq, struct irq_desc *desc); 250extern void handle_bad_irq(unsigned int irq, struct irq_desc *desc);
294extern void handle_nested_irq(unsigned int irq); 251extern void handle_nested_irq(unsigned int irq);
295 252
296/*
297 * Monolithic do_IRQ implementation.
298 */
299#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
300extern unsigned int __do_IRQ(unsigned int irq);
301#endif
302
303/*
304 * Architectures call this to let the generic IRQ layer
305 * handle an interrupt. If the descriptor is attached to an
306 * irqchip-style controller then we call the ->handle_irq() handler,
307 * and it calls __do_IRQ() if it's attached to an irqtype-style controller.
308 */
309static inline void generic_handle_irq_desc(unsigned int irq, struct irq_desc *desc)
310{
311#ifdef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
312 desc->handle_irq(irq, desc);
313#else
314 if (likely(desc->handle_irq))
315 desc->handle_irq(irq, desc);
316 else
317 __do_IRQ(irq);
318#endif
319}
320
321static inline void generic_handle_irq(unsigned int irq)
322{
323 generic_handle_irq_desc(irq, irq_to_desc(irq));
324}
325
326/* Handling of unhandled and spurious interrupts: */ 253/* Handling of unhandled and spurious interrupts: */
327extern void note_interrupt(unsigned int irq, struct irq_desc *desc, 254extern void note_interrupt(unsigned int irq, struct irq_desc *desc,
328 irqreturn_t action_ret); 255 irqreturn_t action_ret);
329 256
330/* Resending of interrupts :*/
331void check_irq_resend(struct irq_desc *desc, unsigned int irq);
332 257
333/* Enable/disable irq debugging output: */ 258/* Enable/disable irq debugging output: */
334extern int noirqdebug_setup(char *str); 259extern int noirqdebug_setup(char *str);
@@ -351,16 +276,6 @@ extern void
351__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, 276__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
352 const char *name); 277 const char *name);
353 278
354/* caller has locked the irq_desc and both params are valid */
355static inline void __set_irq_handler_unlocked(int irq,
356 irq_flow_handler_t handler)
357{
358 struct irq_desc *desc;
359
360 desc = irq_to_desc(irq);
361 desc->handle_irq = handler;
362}
363
364/* 279/*
365 * Set a highlevel flow handler for a given IRQ: 280 * Set a highlevel flow handler for a given IRQ:
366 */ 281 */
@@ -384,141 +299,121 @@ set_irq_chained_handler(unsigned int irq,
384 299
385extern void set_irq_nested_thread(unsigned int irq, int nest); 300extern void set_irq_nested_thread(unsigned int irq, int nest);
386 301
387extern void set_irq_noprobe(unsigned int irq); 302void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set);
388extern void set_irq_probe(unsigned int irq); 303
304static inline void irq_set_status_flags(unsigned int irq, unsigned long set)
305{
306 irq_modify_status(irq, 0, set);
307}
308
309static inline void irq_clear_status_flags(unsigned int irq, unsigned long clr)
310{
311 irq_modify_status(irq, clr, 0);
312}
313
314static inline void set_irq_noprobe(unsigned int irq)
315{
316 irq_modify_status(irq, 0, IRQ_NOPROBE);
317}
318
319static inline void set_irq_probe(unsigned int irq)
320{
321 irq_modify_status(irq, IRQ_NOPROBE, 0);
322}
389 323
390/* Handle dynamic irq creation and destruction */ 324/* Handle dynamic irq creation and destruction */
391extern unsigned int create_irq_nr(unsigned int irq_want, int node); 325extern unsigned int create_irq_nr(unsigned int irq_want, int node);
392extern int create_irq(void); 326extern int create_irq(void);
393extern void destroy_irq(unsigned int irq); 327extern void destroy_irq(unsigned int irq);
394 328
395/* Test to see if a driver has successfully requested an irq */ 329/*
396static inline int irq_has_action(unsigned int irq) 330 * Dynamic irq helper functions. Obsolete. Use irq_alloc_desc* and
331 * irq_free_desc instead.
332 */
333extern void dynamic_irq_cleanup(unsigned int irq);
334static inline void dynamic_irq_init(unsigned int irq)
397{ 335{
398 struct irq_desc *desc = irq_to_desc(irq); 336 dynamic_irq_cleanup(irq);
399 return desc->action != NULL;
400} 337}
401 338
402/* Dynamic irq helper functions */
403extern void dynamic_irq_init(unsigned int irq);
404void dynamic_irq_init_keep_chip_data(unsigned int irq);
405extern void dynamic_irq_cleanup(unsigned int irq);
406void dynamic_irq_cleanup_keep_chip_data(unsigned int irq);
407
408/* Set/get chip/data for an IRQ: */ 339/* Set/get chip/data for an IRQ: */
409extern int set_irq_chip(unsigned int irq, struct irq_chip *chip); 340extern int set_irq_chip(unsigned int irq, struct irq_chip *chip);
410extern int set_irq_data(unsigned int irq, void *data); 341extern int set_irq_data(unsigned int irq, void *data);
411extern int set_irq_chip_data(unsigned int irq, void *data); 342extern int set_irq_chip_data(unsigned int irq, void *data);
412extern int set_irq_type(unsigned int irq, unsigned int type); 343extern int set_irq_type(unsigned int irq, unsigned int type);
413extern int set_irq_msi(unsigned int irq, struct msi_desc *entry); 344extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
345extern struct irq_data *irq_get_irq_data(unsigned int irq);
414 346
415#define get_irq_chip(irq) (irq_to_desc(irq)->chip) 347static inline struct irq_chip *get_irq_chip(unsigned int irq)
416#define get_irq_chip_data(irq) (irq_to_desc(irq)->chip_data)
417#define get_irq_data(irq) (irq_to_desc(irq)->handler_data)
418#define get_irq_msi(irq) (irq_to_desc(irq)->msi_desc)
419
420#define get_irq_desc_chip(desc) ((desc)->chip)
421#define get_irq_desc_chip_data(desc) ((desc)->chip_data)
422#define get_irq_desc_data(desc) ((desc)->handler_data)
423#define get_irq_desc_msi(desc) ((desc)->msi_desc)
424
425#endif /* CONFIG_GENERIC_HARDIRQS */
426
427#endif /* !CONFIG_S390 */
428
429#ifdef CONFIG_SMP
430/**
431 * alloc_desc_masks - allocate cpumasks for irq_desc
432 * @desc: pointer to irq_desc struct
433 * @node: node which will be handling the cpumasks
434 * @boot: true if need bootmem
435 *
436 * Allocates affinity and pending_mask cpumask if required.
437 * Returns true if successful (or not required).
438 */
439static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
440 bool boot)
441{ 348{
442 gfp_t gfp = GFP_ATOMIC; 349 struct irq_data *d = irq_get_irq_data(irq);
443 350 return d ? d->chip : NULL;
444 if (boot) 351}
445 gfp = GFP_NOWAIT;
446
447#ifdef CONFIG_CPUMASK_OFFSTACK
448 if (!alloc_cpumask_var_node(&desc->affinity, gfp, node))
449 return false;
450 352
451#ifdef CONFIG_GENERIC_PENDING_IRQ 353static inline struct irq_chip *irq_data_get_irq_chip(struct irq_data *d)
452 if (!alloc_cpumask_var_node(&desc->pending_mask, gfp, node)) { 354{
453 free_cpumask_var(desc->affinity); 355 return d->chip;
454 return false;
455 }
456#endif
457#endif
458 return true;
459} 356}
460 357
461static inline void init_desc_masks(struct irq_desc *desc) 358static inline void *get_irq_chip_data(unsigned int irq)
462{ 359{
463 cpumask_setall(desc->affinity); 360 struct irq_data *d = irq_get_irq_data(irq);
464#ifdef CONFIG_GENERIC_PENDING_IRQ 361 return d ? d->chip_data : NULL;
465 cpumask_clear(desc->pending_mask);
466#endif
467} 362}
468 363
469/** 364static inline void *irq_data_get_irq_chip_data(struct irq_data *d)
470 * init_copy_desc_masks - copy cpumasks for irq_desc 365{
471 * @old_desc: pointer to old irq_desc struct 366 return d->chip_data;
472 * @new_desc: pointer to new irq_desc struct 367}
473 *
474 * Insures affinity and pending_masks are copied to new irq_desc.
475 * If !CONFIG_CPUMASKS_OFFSTACK the cpumasks are embedded in the
476 * irq_desc struct so the copy is redundant.
477 */
478 368
479static inline void init_copy_desc_masks(struct irq_desc *old_desc, 369static inline void *get_irq_data(unsigned int irq)
480 struct irq_desc *new_desc)
481{ 370{
482#ifdef CONFIG_CPUMASK_OFFSTACK 371 struct irq_data *d = irq_get_irq_data(irq);
483 cpumask_copy(new_desc->affinity, old_desc->affinity); 372 return d ? d->handler_data : NULL;
373}
484 374
485#ifdef CONFIG_GENERIC_PENDING_IRQ 375static inline void *irq_data_get_irq_data(struct irq_data *d)
486 cpumask_copy(new_desc->pending_mask, old_desc->pending_mask); 376{
487#endif 377 return d->handler_data;
488#endif
489} 378}
490 379
491static inline void free_desc_masks(struct irq_desc *old_desc, 380static inline struct msi_desc *get_irq_msi(unsigned int irq)
492 struct irq_desc *new_desc)
493{ 381{
494 free_cpumask_var(old_desc->affinity); 382 struct irq_data *d = irq_get_irq_data(irq);
383 return d ? d->msi_desc : NULL;
384}
495 385
496#ifdef CONFIG_GENERIC_PENDING_IRQ 386static inline struct msi_desc *irq_data_get_msi(struct irq_data *d)
497 free_cpumask_var(old_desc->pending_mask); 387{
498#endif 388 return d->msi_desc;
499} 389}
500 390
501#else /* !CONFIG_SMP */ 391int irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node);
392void irq_free_descs(unsigned int irq, unsigned int cnt);
393int irq_reserve_irqs(unsigned int from, unsigned int cnt);
502 394
503static inline bool alloc_desc_masks(struct irq_desc *desc, int node, 395static inline int irq_alloc_desc(int node)
504 bool boot)
505{ 396{
506 return true; 397 return irq_alloc_descs(-1, 0, 1, node);
507} 398}
508 399
509static inline void init_desc_masks(struct irq_desc *desc) 400static inline int irq_alloc_desc_at(unsigned int at, int node)
510{ 401{
402 return irq_alloc_descs(at, at, 1, node);
511} 403}
512 404
513static inline void init_copy_desc_masks(struct irq_desc *old_desc, 405static inline int irq_alloc_desc_from(unsigned int from, int node)
514 struct irq_desc *new_desc)
515{ 406{
407 return irq_alloc_descs(-1, from, 1, node);
516} 408}
517 409
518static inline void free_desc_masks(struct irq_desc *old_desc, 410static inline void irq_free_desc(unsigned int irq)
519 struct irq_desc *new_desc)
520{ 411{
412 irq_free_descs(irq, 1);
521} 413}
522#endif /* CONFIG_SMP */ 414
415#endif /* CONFIG_GENERIC_HARDIRQS */
416
417#endif /* !CONFIG_S390 */
523 418
524#endif /* _LINUX_IRQ_H */ 419#endif /* _LINUX_IRQ_H */
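
Taken together, the replacement interfaces above change the typical setup sequence: a range of descriptors comes from irq_alloc_descs(), the chip methods receive a struct irq_data and fetch their private data through irq_data_get_irq_chip_data(), and per-IRQ status bits go through the irq_modify_status() helpers. A hedged sketch for an imaginary interrupt controller (the foo_* names, register offsets and the use of set_irq_handler()/handle_level_irq are assumptions, not part of this patch):

#include <linux/irq.h>
#include <linux/io.h>
#include <linux/bitops.h>

struct foo_irq_priv {
        void __iomem    *regs;
        unsigned int    irq_base;
};

#define FOO_MASK_SET    0x00    /* hypothetical register offsets */
#define FOO_MASK_CLR    0x04

static void foo_irq_mask(struct irq_data *d)
{
        struct foo_irq_priv *p = irq_data_get_irq_chip_data(d);

        writel(BIT(d->irq - p->irq_base), p->regs + FOO_MASK_SET);
}

static void foo_irq_unmask(struct irq_data *d)
{
        struct foo_irq_priv *p = irq_data_get_irq_chip_data(d);

        writel(BIT(d->irq - p->irq_base), p->regs + FOO_MASK_CLR);
}

static struct irq_chip foo_irq_chip = {
        .name           = "FOO",
        .irq_mask       = foo_irq_mask,
        .irq_unmask     = foo_irq_unmask,
};

static int foo_setup_irqs(struct foo_irq_priv *p, unsigned int count, int node)
{
        int base, i;

        base = irq_alloc_descs(-1, 0, count, node);     /* dynamic range */
        if (base < 0)
                return base;
        p->irq_base = base;

        for (i = 0; i < count; i++) {
                set_irq_chip(base + i, &foo_irq_chip);
                set_irq_chip_data(base + i, p);
                set_irq_handler(base + i, handle_level_irq);
                set_irq_noprobe(base + i);      /* irq_modify_status() based */
        }
        return base;
}

On teardown the whole range would be returned with irq_free_descs(base, count).
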
diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h
new file mode 100644
index 000000000000..979c68cc7458
--- /dev/null
+++ b/include/linux/irqdesc.h
@@ -0,0 +1,159 @@
1#ifndef _LINUX_IRQDESC_H
2#define _LINUX_IRQDESC_H
3
4/*
5 * Core internal functions to deal with irq descriptors
6 *
 7 * This include will move to kernel/irq once we have cleaned up the tree.
8 * For now it's included from <linux/irq.h>
9 */
10
11struct proc_dir_entry;
12struct timer_rand_state;
13/**
14 * struct irq_desc - interrupt descriptor
15 * @irq_data: per irq and chip data passed down to chip functions
16 * @timer_rand_state: pointer to timer rand state struct
17 * @kstat_irqs: irq stats per cpu
18 * @handle_irq: highlevel irq-events handler [if NULL, __do_IRQ()]
19 * @action: the irq action chain
20 * @status: status information
21 * @depth: disable-depth, for nested irq_disable() calls
22 * @wake_depth: enable depth, for multiple set_irq_wake() callers
23 * @irq_count: stats field to detect stalled irqs
24 * @last_unhandled: aging timer for unhandled count
25 * @irqs_unhandled: stats field for spurious unhandled interrupts
26 * @lock: locking for SMP
27 * @pending_mask: pending rebalanced interrupts
28 * @threads_active: number of irqaction threads currently running
29 * @wait_for_threads: wait queue for sync_irq to wait for threaded handlers
30 * @dir: /proc/irq/ procfs entry
31 * @name: flow handler name for /proc/interrupts output
32 */
33struct irq_desc {
34
35#ifdef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
36 struct irq_data irq_data;
37#else
38 /*
 39 * This union will go away once we have fixed the direct access to
40 * irq_desc all over the place. The direct fields are a 1:1
41 * overlay of irq_data.
42 */
43 union {
44 struct irq_data irq_data;
45 struct {
46 unsigned int irq;
47 unsigned int node;
48 struct irq_chip *chip;
49 void *handler_data;
50 void *chip_data;
51 struct msi_desc *msi_desc;
52#ifdef CONFIG_SMP
53 cpumask_var_t affinity;
54#endif
55 };
56 };
57#endif
58
59 struct timer_rand_state *timer_rand_state;
60 unsigned int *kstat_irqs;
61 irq_flow_handler_t handle_irq;
62 struct irqaction *action; /* IRQ action list */
63 unsigned int status; /* IRQ status */
64
65 unsigned int depth; /* nested irq disables */
66 unsigned int wake_depth; /* nested wake enables */
67 unsigned int irq_count; /* For detecting broken IRQs */
68 unsigned long last_unhandled; /* Aging timer for unhandled count */
69 unsigned int irqs_unhandled;
70 raw_spinlock_t lock;
71#ifdef CONFIG_SMP
72 const struct cpumask *affinity_hint;
73#ifdef CONFIG_GENERIC_PENDING_IRQ
74 cpumask_var_t pending_mask;
75#endif
76#endif
77 atomic_t threads_active;
78 wait_queue_head_t wait_for_threads;
79#ifdef CONFIG_PROC_FS
80 struct proc_dir_entry *dir;
81#endif
82 const char *name;
83} ____cacheline_internodealigned_in_smp;
84
85#ifndef CONFIG_SPARSE_IRQ
86extern struct irq_desc irq_desc[NR_IRQS];
87#endif
88
89/* Will be removed once the last users in power and sh are gone */
90extern struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node);
91static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
92{
93 return desc;
94}
95
96#ifdef CONFIG_GENERIC_HARDIRQS
97
98#define get_irq_desc_chip(desc) ((desc)->irq_data.chip)
99#define get_irq_desc_chip_data(desc) ((desc)->irq_data.chip_data)
100#define get_irq_desc_data(desc) ((desc)->irq_data.handler_data)
101#define get_irq_desc_msi(desc) ((desc)->irq_data.msi_desc)
102
103/*
104 * Monolithic do_IRQ implementation.
105 */
106#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
107extern unsigned int __do_IRQ(unsigned int irq);
108#endif
109
110/*
111 * Architectures call this to let the generic IRQ layer
112 * handle an interrupt. If the descriptor is attached to an
113 * irqchip-style controller then we call the ->handle_irq() handler,
114 * and it calls __do_IRQ() if it's attached to an irqtype-style controller.
115 */
116static inline void generic_handle_irq_desc(unsigned int irq, struct irq_desc *desc)
117{
118#ifdef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
119 desc->handle_irq(irq, desc);
120#else
121 if (likely(desc->handle_irq))
122 desc->handle_irq(irq, desc);
123 else
124 __do_IRQ(irq);
125#endif
126}
127
128static inline void generic_handle_irq(unsigned int irq)
129{
130 generic_handle_irq_desc(irq, irq_to_desc(irq));
131}
132
133/* Test to see if a driver has successfully requested an irq */
134static inline int irq_has_action(unsigned int irq)
135{
136 struct irq_desc *desc = irq_to_desc(irq);
137 return desc->action != NULL;
138}
139
140static inline int irq_balancing_disabled(unsigned int irq)
141{
142 struct irq_desc *desc;
143
144 desc = irq_to_desc(irq);
145 return desc->status & IRQ_NO_BALANCING_MASK;
146}
147
148/* caller has locked the irq_desc and both params are valid */
149static inline void __set_irq_handler_unlocked(int irq,
150 irq_flow_handler_t handler)
151{
152 struct irq_desc *desc;
153
154 desc = irq_to_desc(irq);
155 desc->handle_irq = handler;
156}
157#endif
158
159#endif
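
generic_handle_irq() above is what a chained (demultiplexing) flow handler calls once it has translated a hardware status bit back into a Linux interrupt number. A hedged sketch of such a handler (the status register, the child IRQ range and the use of set_irq_data()/set_irq_chained_handler() at setup time are assumptions):

#include <linux/irq.h>
#include <linux/io.h>
#include <linux/bitops.h>

#define FOO_CHILD_IRQ_BASE      64      /* hypothetical child IRQ range */

/* Demultiplex a bank of 32 sub-interrupts hanging off one parent line. */
static void foo_demux_handler(unsigned int irq, struct irq_desc *desc)
{
        void __iomem *status = get_irq_desc_data(desc); /* via set_irq_data() */
        unsigned long pending = readl(status);
        unsigned int bit;

        for_each_set_bit(bit, &pending, 32)
                generic_handle_irq(FOO_CHILD_IRQ_BASE + bit);
}

The handler would be installed with set_irq_chained_handler(parent_irq, foo_demux_handler).
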
diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h
index 7bf89bc8cbca..05aa8c23483f 100644
--- a/include/linux/irqnr.h
+++ b/include/linux/irqnr.h
@@ -25,6 +25,7 @@
25 25
26extern int nr_irqs; 26extern int nr_irqs;
27extern struct irq_desc *irq_to_desc(unsigned int irq); 27extern struct irq_desc *irq_to_desc(unsigned int irq);
28unsigned int irq_get_next_irq(unsigned int offset);
28 29
29# define for_each_irq_desc(irq, desc) \ 30# define for_each_irq_desc(irq, desc) \
30 for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs; \ 31 for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs; \
@@ -47,6 +48,10 @@ extern struct irq_desc *irq_to_desc(unsigned int irq);
47#define irq_node(irq) 0 48#define irq_node(irq) 0
48#endif 49#endif
49 50
51# define for_each_active_irq(irq) \
52 for (irq = irq_get_next_irq(0); irq < nr_irqs; \
53 irq = irq_get_next_irq(irq + 1))
54
50#endif /* CONFIG_GENERIC_HARDIRQS */ 55#endif /* CONFIG_GENERIC_HARDIRQS */
51 56
52#define for_each_irq_nr(irq) \ 57#define for_each_irq_nr(irq) \
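
for_each_active_irq() iterates only over allocated descriptors, using the new irq_get_next_irq(), instead of probing every number up to nr_irqs. A hedged sketch of the kind of walk it is meant for (the counting function is illustrative):

#include <linux/irq.h>
#include <linux/irqnr.h>

/* Count interrupts that currently have an action (handler) installed. */
static unsigned int foo_count_requested_irqs(void)
{
        unsigned int irq, n = 0;

        for_each_active_irq(irq) {
                if (irq_has_action(irq))
                        n++;
        }
        return n;
}
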
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2b0a35e6bc69..1759ba5adce8 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -58,7 +58,18 @@ extern const char linux_proc_banner[];
58 58
59#define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f)) 59#define FIELD_SIZEOF(t, f) (sizeof(((t*)0)->f))
60#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d)) 60#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
61#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) 61#define roundup(x, y) ( \
62{ \
63 typeof(y) __y = y; \
64 (((x) + (__y - 1)) / __y) * __y; \
65} \
66)
67#define rounddown(x, y) ( \
68{ \
69 typeof(x) __x = (x); \
70 __x - (__x % (y)); \
71} \
72)
62#define DIV_ROUND_CLOSEST(x, divisor)( \ 73#define DIV_ROUND_CLOSEST(x, divisor)( \
63{ \ 74{ \
64 typeof(divisor) __divisor = divisor; \ 75 typeof(divisor) __divisor = divisor; \
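
The statement-expression forms above keep the arithmetic as before but evaluate the alignment argument only once, so an argument with side effects is no longer expanded twice. A couple of worked values as a hedged sanity check (the demo function is illustrative):

#include <linux/kernel.h>
#include <linux/bug.h>

static void foo_round_demo(void)
{
        /* roundup(10, 4)   == ((10 + 3) / 4) * 4 == 12 */
        /* rounddown(10, 4) == 10 - (10 % 4)      == 8  */
        WARN_ON(roundup(10, 4) != 12);
        WARN_ON(rounddown(10, 4) != 8);
}
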
diff --git a/include/linux/key.h b/include/linux/key.h
index cd50dfa1d4c2..3db0adce1fda 100644
--- a/include/linux/key.h
+++ b/include/linux/key.h
@@ -178,8 +178,9 @@ struct key {
178 */ 178 */
179 union { 179 union {
180 unsigned long value; 180 unsigned long value;
181 void __rcu *rcudata;
181 void *data; 182 void *data;
182 struct keyring_list *subscriptions; 183 struct keyring_list __rcu *subscriptions;
183 } payload; 184 } payload;
184}; 185};
185 186
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c13cc48697aa..ac740b26eb10 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -205,7 +205,7 @@ struct kvm {
205 205
206 struct mutex irq_lock; 206 struct mutex irq_lock;
207#ifdef CONFIG_HAVE_KVM_IRQCHIP 207#ifdef CONFIG_HAVE_KVM_IRQCHIP
208 struct kvm_irq_routing_table *irq_routing; 208 struct kvm_irq_routing_table __rcu *irq_routing;
209 struct hlist_head mask_notifier_list; 209 struct hlist_head mask_notifier_list;
210 struct hlist_head irq_ack_notifier_list; 210 struct hlist_head irq_ack_notifier_list;
211#endif 211#endif
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 06aed8305bf3..71c09b26c759 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -32,6 +32,17 @@ extern int lock_stat;
32#define MAX_LOCKDEP_SUBCLASSES 8UL 32#define MAX_LOCKDEP_SUBCLASSES 8UL
33 33
34/* 34/*
35 * NR_LOCKDEP_CACHING_CLASSES ... Number of classes
36 * cached in the instance of lockdep_map
37 *
 38 * Currently the main class (subclass == 0) and the single-depth subclass
 39 * are cached in lockdep_map. This optimization mainly targets
 40 * rq->lock: double_rq_lock() acquires this highly contended lock
 41 * at single depth.
42 */
43#define NR_LOCKDEP_CACHING_CLASSES 2
44
45/*
35 * Lock-classes are keyed via unique addresses, by embedding the 46 * Lock-classes are keyed via unique addresses, by embedding the
36 * lockclass-key into the kernel (or module) .data section. (For 47 * lockclass-key into the kernel (or module) .data section. (For
37 * static locks we use the lock address itself as the key.) 48 * static locks we use the lock address itself as the key.)
@@ -138,7 +149,7 @@ void clear_lock_stats(struct lock_class *class);
138 */ 149 */
139struct lockdep_map { 150struct lockdep_map {
140 struct lock_class_key *key; 151 struct lock_class_key *key;
141 struct lock_class *class_cache; 152 struct lock_class *class_cache[NR_LOCKDEP_CACHING_CLASSES];
142 const char *name; 153 const char *name;
143#ifdef CONFIG_LOCK_STAT 154#ifdef CONFIG_LOCK_STAT
144 int cpu; 155 int cpu;
@@ -424,14 +435,6 @@ do { \
424 435
425#endif /* CONFIG_LOCKDEP */ 436#endif /* CONFIG_LOCKDEP */
426 437
427#ifdef CONFIG_GENERIC_HARDIRQS
428extern void early_init_irq_lock_class(void);
429#else
430static inline void early_init_irq_lock_class(void)
431{
432}
433#endif
434
435#ifdef CONFIG_TRACE_IRQFLAGS 438#ifdef CONFIG_TRACE_IRQFLAGS
436extern void early_boot_irqs_off(void); 439extern void early_boot_irqs_off(void);
437extern void early_boot_irqs_on(void); 440extern void early_boot_irqs_on(void);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index ee7e258627f9..cb57d657ce4d 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -299,7 +299,7 @@ struct mm_struct {
299 * new_owner->mm == mm 299 * new_owner->mm == mm
300 * new_owner->alloc_lock is held 300 * new_owner->alloc_lock is held
301 */ 301 */
302 struct task_struct *owner; 302 struct task_struct __rcu *owner;
303#endif 303#endif
304 304
305#ifdef CONFIG_PROC_FS 305#ifdef CONFIG_PROC_FS
diff --git a/include/linux/msi.h b/include/linux/msi.h
index 91b05c171854..05acced439a3 100644
--- a/include/linux/msi.h
+++ b/include/linux/msi.h
@@ -10,12 +10,13 @@ struct msi_msg {
10}; 10};
11 11
12/* Helper functions */ 12/* Helper functions */
13struct irq_desc; 13struct irq_data;
14extern void mask_msi_irq(unsigned int irq); 14struct msi_desc;
15extern void unmask_msi_irq(unsigned int irq); 15extern void mask_msi_irq(struct irq_data *data);
16extern void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg); 16extern void unmask_msi_irq(struct irq_data *data);
17extern void get_cached_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg); 17extern void __read_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
18extern void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg); 18extern void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
19extern void __write_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
19extern void read_msi_msg(unsigned int irq, struct msi_msg *msg); 20extern void read_msi_msg(unsigned int irq, struct msi_msg *msg);
20extern void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg); 21extern void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg);
21extern void write_msi_msg(unsigned int irq, struct msi_msg *msg); 22extern void write_msi_msg(unsigned int irq, struct msi_msg *msg);
diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h
index 9ed534c991b9..70cd0603911c 100644
--- a/include/linux/netfilter/nfnetlink_conntrack.h
+++ b/include/linux/netfilter/nfnetlink_conntrack.h
@@ -39,8 +39,9 @@ enum ctattr_type {
39 CTA_TUPLE_MASTER, 39 CTA_TUPLE_MASTER,
40 CTA_NAT_SEQ_ADJ_ORIG, 40 CTA_NAT_SEQ_ADJ_ORIG,
41 CTA_NAT_SEQ_ADJ_REPLY, 41 CTA_NAT_SEQ_ADJ_REPLY,
42 CTA_SECMARK, 42 CTA_SECMARK, /* obsolete */
43 CTA_ZONE, 43 CTA_ZONE,
44 CTA_SECCTX,
44 __CTA_MAX 45 __CTA_MAX
45}; 46};
46#define CTA_MAX (__CTA_MAX - 1) 47#define CTA_MAX (__CTA_MAX - 1)
@@ -172,4 +173,11 @@ enum ctattr_help {
172}; 173};
173#define CTA_HELP_MAX (__CTA_HELP_MAX - 1) 174#define CTA_HELP_MAX (__CTA_HELP_MAX - 1)
174 175
176enum ctattr_secctx {
177 CTA_SECCTX_UNSPEC,
178 CTA_SECCTX_NAME,
179 __CTA_SECCTX_MAX
180};
181#define CTA_SECCTX_MAX (__CTA_SECCTX_MAX - 1)
182
175#endif /* _IPCONNTRACK_NETLINK_H */ 183#endif /* _IPCONNTRACK_NETLINK_H */
diff --git a/include/linux/netfilter/xt_SECMARK.h b/include/linux/netfilter/xt_SECMARK.h
index 6fcd3448b186..989092bd6274 100644
--- a/include/linux/netfilter/xt_SECMARK.h
+++ b/include/linux/netfilter/xt_SECMARK.h
@@ -11,18 +11,12 @@
11 * packets are being marked for. 11 * packets are being marked for.
12 */ 12 */
13#define SECMARK_MODE_SEL 0x01 /* SELinux */ 13#define SECMARK_MODE_SEL 0x01 /* SELinux */
14#define SECMARK_SELCTX_MAX 256 14#define SECMARK_SECCTX_MAX 256
15
16struct xt_secmark_target_selinux_info {
17 __u32 selsid;
18 char selctx[SECMARK_SELCTX_MAX];
19};
20 15
21struct xt_secmark_target_info { 16struct xt_secmark_target_info {
22 __u8 mode; 17 __u8 mode;
23 union { 18 __u32 secid;
24 struct xt_secmark_target_selinux_info sel; 19 char secctx[SECMARK_SECCTX_MAX];
25 } u;
26}; 20};
27 21
28#endif /*_XT_SECMARK_H_target */ 22#endif /*_XT_SECMARK_H_target */
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 508f8cf6da37..d0edf7d823ae 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -185,7 +185,7 @@ struct nfs_inode {
185 struct nfs4_cached_acl *nfs4_acl; 185 struct nfs4_cached_acl *nfs4_acl;
186 /* NFSv4 state */ 186 /* NFSv4 state */
187 struct list_head open_states; 187 struct list_head open_states;
188 struct nfs_delegation *delegation; 188 struct nfs_delegation __rcu *delegation;
189 fmode_t delegation_state; 189 fmode_t delegation_state;
190 struct rw_semaphore rwsem; 190 struct rw_semaphore rwsem;
191#endif /* CONFIG_NFS_V4*/ 191#endif /* CONFIG_NFS_V4*/
diff --git a/include/linux/notifier.h b/include/linux/notifier.h
index b2f1a4d83550..2026f9e1ceb8 100644
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -49,28 +49,28 @@
49 49
50struct notifier_block { 50struct notifier_block {
51 int (*notifier_call)(struct notifier_block *, unsigned long, void *); 51 int (*notifier_call)(struct notifier_block *, unsigned long, void *);
52 struct notifier_block *next; 52 struct notifier_block __rcu *next;
53 int priority; 53 int priority;
54}; 54};
55 55
56struct atomic_notifier_head { 56struct atomic_notifier_head {
57 spinlock_t lock; 57 spinlock_t lock;
58 struct notifier_block *head; 58 struct notifier_block __rcu *head;
59}; 59};
60 60
61struct blocking_notifier_head { 61struct blocking_notifier_head {
62 struct rw_semaphore rwsem; 62 struct rw_semaphore rwsem;
63 struct notifier_block *head; 63 struct notifier_block __rcu *head;
64}; 64};
65 65
66struct raw_notifier_head { 66struct raw_notifier_head {
67 struct notifier_block *head; 67 struct notifier_block __rcu *head;
68}; 68};
69 69
70struct srcu_notifier_head { 70struct srcu_notifier_head {
71 struct mutex mutex; 71 struct mutex mutex;
72 struct srcu_struct srcu; 72 struct srcu_struct srcu;
73 struct notifier_block *head; 73 struct notifier_block __rcu *head;
74}; 74};
75 75
76#define ATOMIC_INIT_NOTIFIER_HEAD(name) do { \ 76#define ATOMIC_INIT_NOTIFIER_HEAD(name) do { \
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 570fddeb0388..2615c37c8fe5 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -517,6 +517,7 @@
517#define PCI_DEVICE_ID_AMD_11H_NB_DRAM 0x1302 517#define PCI_DEVICE_ID_AMD_11H_NB_DRAM 0x1302
518#define PCI_DEVICE_ID_AMD_11H_NB_MISC 0x1303 518#define PCI_DEVICE_ID_AMD_11H_NB_MISC 0x1303
519#define PCI_DEVICE_ID_AMD_11H_NB_LINK 0x1304 519#define PCI_DEVICE_ID_AMD_11H_NB_LINK 0x1304
520#define PCI_DEVICE_ID_AMD_15H_NB_MISC 0x1603
520#define PCI_DEVICE_ID_AMD_LANCE 0x2000 521#define PCI_DEVICE_ID_AMD_LANCE 0x2000
521#define PCI_DEVICE_ID_AMD_LANCE_HOME 0x2001 522#define PCI_DEVICE_ID_AMD_LANCE_HOME 0x2001
522#define PCI_DEVICE_ID_AMD_SCSI 0x2020 523#define PCI_DEVICE_ID_AMD_SCSI 0x2020
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index ce2dc655cd1d..27ef6b190ea6 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -139,6 +139,15 @@
139 __aligned(PAGE_SIZE) 139 __aligned(PAGE_SIZE)
140 140
141/* 141/*
142 * Declaration/definition used for per-CPU variables that must be read mostly.
143 */
144#define DECLARE_PER_CPU_READ_MOSTLY(type, name) \
145 DECLARE_PER_CPU_SECTION(type, name, "..readmostly")
146
147#define DEFINE_PER_CPU_READ_MOSTLY(type, name) \
148 DEFINE_PER_CPU_SECTION(type, name, "..readmostly")
149
150/*
142 * Intermodule exports for per-CPU variables. sparse forgets about 151 * Intermodule exports for per-CPU variables. sparse forgets about
143 * address space across EXPORT_SYMBOL(), change EXPORT_SYMBOL() to 152 * address space across EXPORT_SYMBOL(), change EXPORT_SYMBOL() to
144 * noop if __CHECKER__. 153 * noop if __CHECKER__.
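A hedged sketch of how the new read-mostly variants might be used; my_stats and my_stats_ptr are invented for illustration. The DECLARE form belongs in a shared header, the DEFINE form in exactly one .c file, and the variable then lands in the per-CPU ..readmostly section so it does not share cache lines with frequently written per-CPU data.

#include <linux/percpu.h>

struct my_stats;				/* hypothetical type */

/* in a header shared by several files */
DECLARE_PER_CPU_READ_MOSTLY(struct my_stats *, my_stats_ptr);

/* in exactly one translation unit */
DEFINE_PER_CPU_READ_MOSTLY(struct my_stats *, my_stats_ptr);

static struct my_stats *my_stats_get(int cpu)
{
	return per_cpu(my_stats_ptr, cpu);	/* read-mostly fast path */
}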
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 634b8e674ac5..a39cbed9ee17 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -47,6 +47,8 @@ static inline void *radix_tree_indirect_to_ptr(void *ptr)
47{ 47{
48 return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR); 48 return (void *)((unsigned long)ptr & ~RADIX_TREE_INDIRECT_PTR);
49} 49}
50#define radix_tree_indirect_to_ptr(ptr) \
51 radix_tree_indirect_to_ptr((void __force *)(ptr))
50 52
51static inline int radix_tree_is_indirect_ptr(void *ptr) 53static inline int radix_tree_is_indirect_ptr(void *ptr)
52{ 54{
@@ -61,7 +63,7 @@ static inline int radix_tree_is_indirect_ptr(void *ptr)
61struct radix_tree_root { 63struct radix_tree_root {
62 unsigned int height; 64 unsigned int height;
63 gfp_t gfp_mask; 65 gfp_t gfp_mask;
64 struct radix_tree_node *rnode; 66 struct radix_tree_node __rcu *rnode;
65}; 67};
66 68
67#define RADIX_TREE_INIT(mask) { \ 69#define RADIX_TREE_INIT(mask) { \
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 4ec3b38ce9c5..f31ef61f1c65 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -10,6 +10,21 @@
10#include <linux/rcupdate.h> 10#include <linux/rcupdate.h>
11 11
12/* 12/*
13 * Why is there no list_empty_rcu()? Because list_empty() serves this
14 * purpose. The list_empty() function fetches the RCU-protected pointer
15 * and compares it to the address of the list head, but neither dereferences
16 * this pointer itself nor provides this pointer to the caller. Therefore,
17 * it is not necessary to use rcu_dereference(), so that list_empty() can
18 * be used anywhere you would want to use a list_empty_rcu().
19 */
20
21/*
22 * return the ->next pointer of a list_head in an rcu safe
23 * way, we must not access it directly
24 */
25#define list_next_rcu(list) (*((struct list_head __rcu **)(&(list)->next)))
26
27/*
13 * Insert a new entry between two known consecutive entries. 28 * Insert a new entry between two known consecutive entries.
14 * 29 *
15 * This is only for internal list manipulation where we know 30 * This is only for internal list manipulation where we know
@@ -20,7 +35,7 @@ static inline void __list_add_rcu(struct list_head *new,
20{ 35{
21 new->next = next; 36 new->next = next;
22 new->prev = prev; 37 new->prev = prev;
23 rcu_assign_pointer(prev->next, new); 38 rcu_assign_pointer(list_next_rcu(prev), new);
24 next->prev = new; 39 next->prev = new;
25} 40}
26 41
@@ -138,7 +153,7 @@ static inline void list_replace_rcu(struct list_head *old,
138{ 153{
139 new->next = old->next; 154 new->next = old->next;
140 new->prev = old->prev; 155 new->prev = old->prev;
141 rcu_assign_pointer(new->prev->next, new); 156 rcu_assign_pointer(list_next_rcu(new->prev), new);
142 new->next->prev = new; 157 new->next->prev = new;
143 old->prev = LIST_POISON2; 158 old->prev = LIST_POISON2;
144} 159}
@@ -193,7 +208,7 @@ static inline void list_splice_init_rcu(struct list_head *list,
193 */ 208 */
194 209
195 last->next = at; 210 last->next = at;
196 rcu_assign_pointer(head->next, first); 211 rcu_assign_pointer(list_next_rcu(head), first);
197 first->prev = head; 212 first->prev = head;
198 at->prev = last; 213 at->prev = last;
199} 214}
@@ -208,7 +223,9 @@ static inline void list_splice_init_rcu(struct list_head *list,
208 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). 223 * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
209 */ 224 */
210#define list_entry_rcu(ptr, type, member) \ 225#define list_entry_rcu(ptr, type, member) \
211 container_of(rcu_dereference_raw(ptr), type, member) 226 ({typeof (*ptr) __rcu *__ptr = (typeof (*ptr) __rcu __force *)ptr; \
227 container_of((typeof(ptr))rcu_dereference_raw(__ptr), type, member); \
228 })
212 229
213/** 230/**
214 * list_first_entry_rcu - get the first element from a list 231 * list_first_entry_rcu - get the first element from a list
@@ -225,9 +242,9 @@ static inline void list_splice_init_rcu(struct list_head *list,
225 list_entry_rcu((ptr)->next, type, member) 242 list_entry_rcu((ptr)->next, type, member)
226 243
227#define __list_for_each_rcu(pos, head) \ 244#define __list_for_each_rcu(pos, head) \
228 for (pos = rcu_dereference_raw((head)->next); \ 245 for (pos = rcu_dereference_raw(list_next_rcu(head)); \
229 pos != (head); \ 246 pos != (head); \
230 pos = rcu_dereference_raw(pos->next)) 247 pos = rcu_dereference_raw(list_next_rcu((pos)))
231 248
232/** 249/**
233 * list_for_each_entry_rcu - iterate over rcu list of given type 250 * list_for_each_entry_rcu - iterate over rcu list of given type
@@ -257,9 +274,9 @@ static inline void list_splice_init_rcu(struct list_head *list,
257 * as long as the traversal is guarded by rcu_read_lock(). 274 * as long as the traversal is guarded by rcu_read_lock().
258 */ 275 */
259#define list_for_each_continue_rcu(pos, head) \ 276#define list_for_each_continue_rcu(pos, head) \
260 for ((pos) = rcu_dereference_raw((pos)->next); \ 277 for ((pos) = rcu_dereference_raw(list_next_rcu(pos)); \
261 prefetch((pos)->next), (pos) != (head); \ 278 prefetch((pos)->next), (pos) != (head); \
262 (pos) = rcu_dereference_raw((pos)->next)) 279 (pos) = rcu_dereference_raw(list_next_rcu(pos)))
263 280
264/** 281/**
265 * list_for_each_entry_continue_rcu - continue iteration over list of given type 282 * list_for_each_entry_continue_rcu - continue iteration over list of given type
@@ -314,12 +331,19 @@ static inline void hlist_replace_rcu(struct hlist_node *old,
314 331
315 new->next = next; 332 new->next = next;
316 new->pprev = old->pprev; 333 new->pprev = old->pprev;
317 rcu_assign_pointer(*new->pprev, new); 334 rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new);
318 if (next) 335 if (next)
319 new->next->pprev = &new->next; 336 new->next->pprev = &new->next;
320 old->pprev = LIST_POISON2; 337 old->pprev = LIST_POISON2;
321} 338}
322 339
340/*
341 * return the first or the next element in an RCU protected hlist
342 */
343#define hlist_first_rcu(head) (*((struct hlist_node __rcu **)(&(head)->first)))
344#define hlist_next_rcu(node) (*((struct hlist_node __rcu **)(&(node)->next)))
345#define hlist_pprev_rcu(node) (*((struct hlist_node __rcu **)((node)->pprev)))
346
323/** 347/**
324 * hlist_add_head_rcu 348 * hlist_add_head_rcu
325 * @n: the element to add to the hash list. 349 * @n: the element to add to the hash list.
@@ -346,7 +370,7 @@ static inline void hlist_add_head_rcu(struct hlist_node *n,
346 370
347 n->next = first; 371 n->next = first;
348 n->pprev = &h->first; 372 n->pprev = &h->first;
349 rcu_assign_pointer(h->first, n); 373 rcu_assign_pointer(hlist_first_rcu(h), n);
350 if (first) 374 if (first)
351 first->pprev = &n->next; 375 first->pprev = &n->next;
352} 376}
@@ -374,7 +398,7 @@ static inline void hlist_add_before_rcu(struct hlist_node *n,
374{ 398{
375 n->pprev = next->pprev; 399 n->pprev = next->pprev;
376 n->next = next; 400 n->next = next;
377 rcu_assign_pointer(*(n->pprev), n); 401 rcu_assign_pointer(hlist_pprev_rcu(n), n);
378 next->pprev = &n->next; 402 next->pprev = &n->next;
379} 403}
380 404
@@ -401,15 +425,15 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
401{ 425{
402 n->next = prev->next; 426 n->next = prev->next;
403 n->pprev = &prev->next; 427 n->pprev = &prev->next;
404 rcu_assign_pointer(prev->next, n); 428 rcu_assign_pointer(hlist_next_rcu(prev), n);
405 if (n->next) 429 if (n->next)
406 n->next->pprev = &n->next; 430 n->next->pprev = &n->next;
407} 431}
408 432
409#define __hlist_for_each_rcu(pos, head) \ 433#define __hlist_for_each_rcu(pos, head) \
410 for (pos = rcu_dereference((head)->first); \ 434 for (pos = rcu_dereference(hlist_first_rcu(head)); \
411 pos && ({ prefetch(pos->next); 1; }); \ 435 pos && ({ prefetch(pos->next); 1; }); \
412 pos = rcu_dereference(pos->next)) 436 pos = rcu_dereference(hlist_next_rcu(pos)))
413 437
414/** 438/**
415 * hlist_for_each_entry_rcu - iterate over rcu list of given type 439 * hlist_for_each_entry_rcu - iterate over rcu list of given type
@@ -422,11 +446,11 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
422 * the _rcu list-mutation primitives such as hlist_add_head_rcu() 446 * the _rcu list-mutation primitives such as hlist_add_head_rcu()
423 * as long as the traversal is guarded by rcu_read_lock(). 447 * as long as the traversal is guarded by rcu_read_lock().
424 */ 448 */
425#define hlist_for_each_entry_rcu(tpos, pos, head, member) \ 449#define hlist_for_each_entry_rcu(tpos, pos, head, member) \
426 for (pos = rcu_dereference_raw((head)->first); \ 450 for (pos = rcu_dereference_raw(hlist_first_rcu(head)); \
427 pos && ({ prefetch(pos->next); 1; }) && \ 451 pos && ({ prefetch(pos->next); 1; }) && \
428 ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \ 452 ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1; }); \
429 pos = rcu_dereference_raw(pos->next)) 453 pos = rcu_dereference_raw(hlist_next_rcu(pos)))
430 454
431/** 455/**
432 * hlist_for_each_entry_rcu_bh - iterate over rcu list of given type 456 * hlist_for_each_entry_rcu_bh - iterate over rcu list of given type
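A minimal usage sketch of the list primitives touched above; struct item, item_list and item_lock are invented for illustration and are not part of the patch. Readers traverse under rcu_read_lock(), while the updater serializes against other updaters with its own lock and defers the free with call_rcu():

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct item {
	int key;
	struct list_head node;
	struct rcu_head rcu;
};

static LIST_HEAD(item_list);
static DEFINE_SPINLOCK(item_lock);

/* reader: may run concurrently with item_del() */
static bool item_present(int key)
{
	struct item *it;
	bool found = false;

	rcu_read_lock();
	list_for_each_entry_rcu(it, &item_list, node) {
		if (it->key == key) {
			found = true;
			break;
		}
	}
	rcu_read_unlock();
	return found;
}

static void item_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct item, rcu));
}

/* updater: unlink now, free after a grace period */
static void item_del(struct item *it)
{
	spin_lock(&item_lock);
	list_del_rcu(&it->node);
	spin_unlock(&item_lock);
	call_rcu(&it->rcu, item_free_rcu);
}

Note that struct list_head itself needs no __rcu annotation in such code; the accessors reach the ->next pointer through list_next_rcu(), which is exactly what the hunks above introduce.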
diff --git a/include/linux/rculist_nulls.h b/include/linux/rculist_nulls.h
index b70ffe53cb9f..2ae13714828b 100644
--- a/include/linux/rculist_nulls.h
+++ b/include/linux/rculist_nulls.h
@@ -37,6 +37,12 @@ static inline void hlist_nulls_del_init_rcu(struct hlist_nulls_node *n)
37 } 37 }
38} 38}
39 39
40#define hlist_nulls_first_rcu(head) \
41 (*((struct hlist_nulls_node __rcu __force **)&(head)->first))
42
43#define hlist_nulls_next_rcu(node) \
44 (*((struct hlist_nulls_node __rcu __force **)&(node)->next))
45
40/** 46/**
41 * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization 47 * hlist_nulls_del_rcu - deletes entry from hash list without re-initialization
42 * @n: the element to delete from the hash list. 48 * @n: the element to delete from the hash list.
@@ -88,7 +94,7 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
88 94
89 n->next = first; 95 n->next = first;
90 n->pprev = &h->first; 96 n->pprev = &h->first;
91 rcu_assign_pointer(h->first, n); 97 rcu_assign_pointer(hlist_nulls_first_rcu(h), n);
92 if (!is_a_nulls(first)) 98 if (!is_a_nulls(first))
93 first->pprev = &n->next; 99 first->pprev = &n->next;
94} 100}
@@ -100,11 +106,11 @@ static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
100 * @member: the name of the hlist_nulls_node within the struct. 106 * @member: the name of the hlist_nulls_node within the struct.
101 * 107 *
102 */ 108 */
103#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \ 109#define hlist_nulls_for_each_entry_rcu(tpos, pos, head, member) \
104 for (pos = rcu_dereference_raw((head)->first); \ 110 for (pos = rcu_dereference_raw(hlist_nulls_first_rcu(head)); \
105 (!is_a_nulls(pos)) && \ 111 (!is_a_nulls(pos)) && \
106 ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \ 112 ({ tpos = hlist_nulls_entry(pos, typeof(*tpos), member); 1; }); \
107 pos = rcu_dereference_raw(pos->next)) 113 pos = rcu_dereference_raw(hlist_nulls_next_rcu(pos)))
108 114
109#endif 115#endif
110#endif 116#endif
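For the nulls variant, a hedged reader sketch (struct entry and entry_present() are invented; the head is assumed to have been set up with INIT_HLIST_NULLS_HEAD(head, slot)). Because objects freed with SLAB_DESTROY_BY_RCU can be reused and re-linked into a different chain while a reader walks the list, a lookup that falls off the end must check the nulls value and retry if it finished on the wrong chain:

#include <linux/rculist_nulls.h>
#include <linux/rcupdate.h>
#include <linux/types.h>

struct entry {
	int key;
	struct hlist_nulls_node node;
};

static bool entry_present(struct hlist_nulls_head *head, int key,
			  unsigned long slot)
{
	struct entry *e;
	struct hlist_nulls_node *pos;
	bool found = false;

begin:
	rcu_read_lock();
	hlist_nulls_for_each_entry_rcu(e, pos, head, node) {
		if (e->key == key) {
			found = true;
			goto out;
		}
	}
	/* fell off the end: make sure the nulls marker was ours */
	if (get_nulls_value(pos) != slot) {
		rcu_read_unlock();
		goto begin;
	}
out:
	rcu_read_unlock();
	return found;
}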
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 83af1f8d8b74..03cda7bed985 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -41,11 +41,15 @@
41#include <linux/lockdep.h> 41#include <linux/lockdep.h>
42#include <linux/completion.h> 42#include <linux/completion.h>
43#include <linux/debugobjects.h> 43#include <linux/debugobjects.h>
44#include <linux/compiler.h>
44 45
45#ifdef CONFIG_RCU_TORTURE_TEST 46#ifdef CONFIG_RCU_TORTURE_TEST
46extern int rcutorture_runnable; /* for sysctl */ 47extern int rcutorture_runnable; /* for sysctl */
47#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ 48#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
48 49
50#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
51#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
52
49/** 53/**
50 * struct rcu_head - callback structure for use with RCU 54 * struct rcu_head - callback structure for use with RCU
51 * @next: next update requests in a list 55 * @next: next update requests in a list
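The new ULONG_CMP_GE()/ULONG_CMP_LT() macros compare counters that are allowed to wrap, treating two values as correctly ordered as long as they are within ULONG_MAX/2 of one another, which is the property needed when comparing counters such as jiffies that wrap. A tiny stand-alone illustration of the arithmetic (plain userspace C, not kernel code):

#include <limits.h>
#include <stdio.h>

#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))
#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

int main(void)
{
	unsigned long before_wrap = ULONG_MAX - 1;	/* counter about to wrap */
	unsigned long after_wrap  = 2;			/* same counter, post-wrap */

	/* A plain "<" claims the older value is the newer one. */
	printf("%d\n", before_wrap < after_wrap);		/* prints 0 */

	/* The wrap-tolerant comparisons get the ordering right. */
	printf("%d\n", ULONG_CMP_LT(before_wrap, after_wrap));	/* prints 1 */
	printf("%d\n", ULONG_CMP_GE(after_wrap, before_wrap));	/* prints 1 */
	return 0;
}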
@@ -57,29 +61,94 @@ struct rcu_head {
57}; 61};
58 62
59/* Exported common interfaces */ 63/* Exported common interfaces */
60extern void rcu_barrier(void); 64extern void call_rcu_sched(struct rcu_head *head,
65 void (*func)(struct rcu_head *rcu));
66extern void synchronize_sched(void);
61extern void rcu_barrier_bh(void); 67extern void rcu_barrier_bh(void);
62extern void rcu_barrier_sched(void); 68extern void rcu_barrier_sched(void);
63extern void synchronize_sched_expedited(void); 69extern void synchronize_sched_expedited(void);
64extern int sched_expedited_torture_stats(char *page); 70extern int sched_expedited_torture_stats(char *page);
65 71
72static inline void __rcu_read_lock_bh(void)
73{
74 local_bh_disable();
75}
76
77static inline void __rcu_read_unlock_bh(void)
78{
79 local_bh_enable();
80}
81
82#ifdef CONFIG_PREEMPT_RCU
83
84extern void __rcu_read_lock(void);
85extern void __rcu_read_unlock(void);
86void synchronize_rcu(void);
87
88/*
89 * Defined as a macro as it is a very low level header included from
90 * areas that don't even know about current. This gives the rcu_read_lock()
91 * nesting depth, but makes sense only if CONFIG_PREEMPT_RCU -- in other
92 * types of kernel builds, the rcu_read_lock() nesting depth is unknowable.
93 */
94#define rcu_preempt_depth() (current->rcu_read_lock_nesting)
95
96#else /* #ifdef CONFIG_PREEMPT_RCU */
97
98static inline void __rcu_read_lock(void)
99{
100 preempt_disable();
101}
102
103static inline void __rcu_read_unlock(void)
104{
105 preempt_enable();
106}
107
108static inline void synchronize_rcu(void)
109{
110 synchronize_sched();
111}
112
113static inline int rcu_preempt_depth(void)
114{
115 return 0;
116}
117
118#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
119
66/* Internal to kernel */ 120/* Internal to kernel */
67extern void rcu_init(void); 121extern void rcu_init(void);
122extern void rcu_sched_qs(int cpu);
123extern void rcu_bh_qs(int cpu);
124extern void rcu_check_callbacks(int cpu, int user);
125struct notifier_block;
126
127#ifdef CONFIG_NO_HZ
128
129extern void rcu_enter_nohz(void);
130extern void rcu_exit_nohz(void);
131
132#else /* #ifdef CONFIG_NO_HZ */
133
134static inline void rcu_enter_nohz(void)
135{
136}
137
138static inline void rcu_exit_nohz(void)
139{
140}
141
142#endif /* #else #ifdef CONFIG_NO_HZ */
68 143
69#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) 144#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU)
70#include <linux/rcutree.h> 145#include <linux/rcutree.h>
71#elif defined(CONFIG_TINY_RCU) 146#elif defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU)
72#include <linux/rcutiny.h> 147#include <linux/rcutiny.h>
73#else 148#else
74#error "Unknown RCU implementation specified to kernel configuration" 149#error "Unknown RCU implementation specified to kernel configuration"
75#endif 150#endif
76 151
77#define RCU_HEAD_INIT { .next = NULL, .func = NULL }
78#define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT
79#define INIT_RCU_HEAD(ptr) do { \
80 (ptr)->next = NULL; (ptr)->func = NULL; \
81} while (0)
82
83/* 152/*
84 * init_rcu_head_on_stack()/destroy_rcu_head_on_stack() are needed for dynamic 153 * init_rcu_head_on_stack()/destroy_rcu_head_on_stack() are needed for dynamic
85 * initialization and destruction of rcu_head on the stack. rcu_head structures 154 * initialization and destruction of rcu_head on the stack. rcu_head structures
@@ -120,14 +189,15 @@ extern struct lockdep_map rcu_sched_lock_map;
120extern int debug_lockdep_rcu_enabled(void); 189extern int debug_lockdep_rcu_enabled(void);
121 190
122/** 191/**
123 * rcu_read_lock_held - might we be in RCU read-side critical section? 192 * rcu_read_lock_held() - might we be in RCU read-side critical section?
124 * 193 *
125 * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU 194 * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU
126 * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC, 195 * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC,
127 * this assumes we are in an RCU read-side critical section unless it can 196 * this assumes we are in an RCU read-side critical section unless it can
128 * prove otherwise. 197 * prove otherwise. This is useful for debug checks in functions that
198 * require that they be called within an RCU read-side critical section.
129 * 199 *
130 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot 200 * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot
131 * and while lockdep is disabled. 201 * and while lockdep is disabled.
132 */ 202 */
133static inline int rcu_read_lock_held(void) 203static inline int rcu_read_lock_held(void)
@@ -144,14 +214,16 @@ static inline int rcu_read_lock_held(void)
144extern int rcu_read_lock_bh_held(void); 214extern int rcu_read_lock_bh_held(void);
145 215
146/** 216/**
147 * rcu_read_lock_sched_held - might we be in RCU-sched read-side critical section? 217 * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
148 * 218 *
149 * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an 219 * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an
150 * RCU-sched read-side critical section. In absence of 220 * RCU-sched read-side critical section. In absence of
151 * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side 221 * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side
152 * critical section unless it can prove otherwise. Note that disabling 222 * critical section unless it can prove otherwise. Note that disabling
153 * of preemption (including disabling irqs) counts as an RCU-sched 223 * of preemption (including disabling irqs) counts as an RCU-sched
154 * read-side critical section. 224 * read-side critical section. This is useful for debug checks in functions
225 * that required that they be called within an RCU-sched read-side
226 * critical section.
155 * 227 *
156 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot 228 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot
157 * and while lockdep is disabled. 229 * and while lockdep is disabled.
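As the comments note, rcu_read_lock_held() and rcu_read_lock_sched_held() exist for debug checks in code that must be called with the corresponding protection. A hedged sketch (the function and its locking requirement are invented for illustration):

#include <linux/bug.h>
#include <linux/rcupdate.h>

/* must be called with preemption disabled, i.e. as an RCU-sched reader */
static void my_touch_percpu_state(void)
{
	WARN_ON_ONCE(!rcu_read_lock_sched_held());
	/* ... manipulate state protected by RCU-sched grace periods ... */
}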
@@ -211,7 +283,11 @@ static inline int rcu_read_lock_sched_held(void)
211 283
212extern int rcu_my_thread_group_empty(void); 284extern int rcu_my_thread_group_empty(void);
213 285
214#define __do_rcu_dereference_check(c) \ 286/**
287 * rcu_lockdep_assert - emit lockdep splat if specified condition not met
288 * @c: condition to check
289 */
290#define rcu_lockdep_assert(c) \
215 do { \ 291 do { \
216 static bool __warned; \ 292 static bool __warned; \
217 if (debug_lockdep_rcu_enabled() && !__warned && !(c)) { \ 293 if (debug_lockdep_rcu_enabled() && !__warned && !(c)) { \
@@ -220,41 +296,163 @@ extern int rcu_my_thread_group_empty(void);
220 } \ 296 } \
221 } while (0) 297 } while (0)
222 298
299#else /* #ifdef CONFIG_PROVE_RCU */
300
301#define rcu_lockdep_assert(c) do { } while (0)
302
303#endif /* #else #ifdef CONFIG_PROVE_RCU */
304
305/*
306 * Helper functions for rcu_dereference_check(), rcu_dereference_protected()
307 * and rcu_assign_pointer(). Some of these could be folded into their
308 * callers, but they are left separate in order to ease introduction of
309 * multiple flavors of pointers to match the multiple flavors of RCU
310 * (e.g., __rcu_bh, __rcu_sched, and __srcu), should this make sense in
311 * the future.
312 */
313
314#ifdef __CHECKER__
315#define rcu_dereference_sparse(p, space) \
316 ((void)(((typeof(*p) space *)p) == p))
317#else /* #ifdef __CHECKER__ */
318#define rcu_dereference_sparse(p, space)
319#endif /* #else #ifdef __CHECKER__ */
320
321#define __rcu_access_pointer(p, space) \
322 ({ \
323 typeof(*p) *_________p1 = (typeof(*p)*__force )ACCESS_ONCE(p); \
324 rcu_dereference_sparse(p, space); \
325 ((typeof(*p) __force __kernel *)(_________p1)); \
326 })
327#define __rcu_dereference_check(p, c, space) \
328 ({ \
329 typeof(*p) *_________p1 = (typeof(*p)*__force )ACCESS_ONCE(p); \
330 rcu_lockdep_assert(c); \
331 rcu_dereference_sparse(p, space); \
332 smp_read_barrier_depends(); \
333 ((typeof(*p) __force __kernel *)(_________p1)); \
334 })
335#define __rcu_dereference_protected(p, c, space) \
336 ({ \
337 rcu_lockdep_assert(c); \
338 rcu_dereference_sparse(p, space); \
339 ((typeof(*p) __force __kernel *)(p)); \
340 })
341
342#define __rcu_dereference_index_check(p, c) \
343 ({ \
344 typeof(p) _________p1 = ACCESS_ONCE(p); \
345 rcu_lockdep_assert(c); \
346 smp_read_barrier_depends(); \
347 (_________p1); \
348 })
349#define __rcu_assign_pointer(p, v, space) \
350 ({ \
351 if (!__builtin_constant_p(v) || \
352 ((v) != NULL)) \
353 smp_wmb(); \
354 (p) = (typeof(*v) __force space *)(v); \
355 })
356
357
358/**
359 * rcu_access_pointer() - fetch RCU pointer with no dereferencing
360 * @p: The pointer to read
361 *
362 * Return the value of the specified RCU-protected pointer, but omit the
363 * smp_read_barrier_depends() and keep the ACCESS_ONCE(). This is useful
364 * when the value of this pointer is accessed, but the pointer is not
365 * dereferenced, for example, when testing an RCU-protected pointer against
366 * NULL. Although rcu_access_pointer() may also be used in cases where
367 * update-side locks prevent the value of the pointer from changing, you
368 * should instead use rcu_dereference_protected() for this use case.
369 */
370#define rcu_access_pointer(p) __rcu_access_pointer((p), __rcu)
371
223/** 372/**
224 * rcu_dereference_check - rcu_dereference with debug checking 373 * rcu_dereference_check() - rcu_dereference with debug checking
225 * @p: The pointer to read, prior to dereferencing 374 * @p: The pointer to read, prior to dereferencing
226 * @c: The conditions under which the dereference will take place 375 * @c: The conditions under which the dereference will take place
227 * 376 *
228 * Do an rcu_dereference(), but check that the conditions under which the 377 * Do an rcu_dereference(), but check that the conditions under which the
229 * dereference will take place are correct. Typically the conditions indicate 378 * dereference will take place are correct. Typically the conditions
230 * the various locking conditions that should be held at that point. The check 379 * indicate the various locking conditions that should be held at that
231 * should return true if the conditions are satisfied. 380 * point. The check should return true if the conditions are satisfied.
381 * An implicit check for being in an RCU read-side critical section
382 * (rcu_read_lock()) is included.
232 * 383 *
233 * For example: 384 * For example:
234 * 385 *
235 * bar = rcu_dereference_check(foo->bar, rcu_read_lock_held() || 386 * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock));
236 * lockdep_is_held(&foo->lock));
237 * 387 *
238 * could be used to indicate to lockdep that foo->bar may only be dereferenced 388 * could be used to indicate to lockdep that foo->bar may only be dereferenced
239 * if either the RCU read lock is held, or that the lock required to replace 389 * if either rcu_read_lock() is held, or that the lock required to replace
240 * the bar struct at foo->bar is held. 390 * the bar struct at foo->bar is held.
241 * 391 *
242 * Note that the list of conditions may also include indications of when a lock 392 * Note that the list of conditions may also include indications of when a lock
243 * need not be held, for example during initialisation or destruction of the 393 * need not be held, for example during initialisation or destruction of the
244 * target struct: 394 * target struct:
245 * 395 *
246 * bar = rcu_dereference_check(foo->bar, rcu_read_lock_held() || 396 * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock) ||
247 * lockdep_is_held(&foo->lock) ||
248 * atomic_read(&foo->usage) == 0); 397 * atomic_read(&foo->usage) == 0);
398 *
399 * Inserts memory barriers on architectures that require them
400 * (currently only the Alpha), prevents the compiler from refetching
401 * (and from merging fetches), and, more importantly, documents exactly
402 * which pointers are protected by RCU and checks that the pointer is
403 * annotated as __rcu.
249 */ 404 */
250#define rcu_dereference_check(p, c) \ 405#define rcu_dereference_check(p, c) \
251 ({ \ 406 __rcu_dereference_check((p), rcu_read_lock_held() || (c), __rcu)
252 __do_rcu_dereference_check(c); \ 407
253 rcu_dereference_raw(p); \ 408/**
254 }) 409 * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking
410 * @p: The pointer to read, prior to dereferencing
411 * @c: The conditions under which the dereference will take place
412 *
413 * This is the RCU-bh counterpart to rcu_dereference_check().
414 */
415#define rcu_dereference_bh_check(p, c) \
416 __rcu_dereference_check((p), rcu_read_lock_bh_held() || (c), __rcu)
255 417
256/** 418/**
257 * rcu_dereference_protected - fetch RCU pointer when updates prevented 419 * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking
420 * @p: The pointer to read, prior to dereferencing
421 * @c: The conditions under which the dereference will take place
422 *
423 * This is the RCU-sched counterpart to rcu_dereference_check().
424 */
425#define rcu_dereference_sched_check(p, c) \
426 __rcu_dereference_check((p), rcu_read_lock_sched_held() || (c), \
427 __rcu)
428
429#define rcu_dereference_raw(p) rcu_dereference_check(p, 1) /*@@@ needed? @@@*/
430
431/**
432 * rcu_dereference_index_check() - rcu_dereference for indices with debug checking
433 * @p: The pointer to read, prior to dereferencing
434 * @c: The conditions under which the dereference will take place
435 *
436 * Similar to rcu_dereference_check(), but omits the sparse checking.
437 * This allows rcu_dereference_index_check() to be used on integers,
438 * which can then be used as array indices. Attempting to use
439 * rcu_dereference_check() on an integer will give compiler warnings
440 * because the sparse address-space mechanism relies on dereferencing
441 * the RCU-protected pointer. Dereferencing integers is not something
442 * that even gcc will put up with.
443 *
444 * Note that this function does not implicitly check for RCU read-side
445 * critical sections. If this function gains lots of uses, it might
446 * make sense to provide versions for each flavor of RCU, but it does
447 * not make sense as of early 2010.
448 */
449#define rcu_dereference_index_check(p, c) \
450 __rcu_dereference_index_check((p), (c))
451
452/**
453 * rcu_dereference_protected() - fetch RCU pointer when updates prevented
454 * @p: The pointer to read, prior to dereferencing
455 * @c: The conditions under which the dereference will take place
258 * 456 *
259 * Return the value of the specified RCU-protected pointer, but omit 457 * Return the value of the specified RCU-protected pointer, but omit
260 * both the smp_read_barrier_depends() and the ACCESS_ONCE(). This 458 * both the smp_read_barrier_depends() and the ACCESS_ONCE(). This
@@ -263,35 +461,61 @@ extern int rcu_my_thread_group_empty(void);
263 * prevent the compiler from repeating this reference or combining it 461 * prevent the compiler from repeating this reference or combining it
264 * with other references, so it should not be used without protection 462 * with other references, so it should not be used without protection
265 * of appropriate locks. 463 * of appropriate locks.
464 *
465 * This function is only for update-side use. Using this function
466 * when protected only by rcu_read_lock() will result in infrequent
467 * but very ugly failures.
266 */ 468 */
267#define rcu_dereference_protected(p, c) \ 469#define rcu_dereference_protected(p, c) \
268 ({ \ 470 __rcu_dereference_protected((p), (c), __rcu)
269 __do_rcu_dereference_check(c); \
270 (p); \
271 })
272 471
273#else /* #ifdef CONFIG_PROVE_RCU */ 472/**
473 * rcu_dereference_bh_protected() - fetch RCU-bh pointer when updates prevented
474 * @p: The pointer to read, prior to dereferencing
475 * @c: The conditions under which the dereference will take place
476 *
477 * This is the RCU-bh counterpart to rcu_dereference_protected().
478 */
479#define rcu_dereference_bh_protected(p, c) \
480 __rcu_dereference_protected((p), (c), __rcu)
274 481
275#define rcu_dereference_check(p, c) rcu_dereference_raw(p) 482/**
276#define rcu_dereference_protected(p, c) (p) 483 * rcu_dereference_sched_protected() - fetch RCU-sched pointer when updates prevented
484 * @p: The pointer to read, prior to dereferencing
485 * @c: The conditions under which the dereference will take place
486 *
487 * This is the RCU-sched counterpart to rcu_dereference_protected().
488 */
489#define rcu_dereference_sched_protected(p, c) \
490 __rcu_dereference_protected((p), (c), __rcu)
277 491
278#endif /* #else #ifdef CONFIG_PROVE_RCU */
279 492
280/** 493/**
281 * rcu_access_pointer - fetch RCU pointer with no dereferencing 494 * rcu_dereference() - fetch RCU-protected pointer for dereferencing
495 * @p: The pointer to read, prior to dereferencing
282 * 496 *
283 * Return the value of the specified RCU-protected pointer, but omit the 497 * This is a simple wrapper around rcu_dereference_check().
284 * smp_read_barrier_depends() and keep the ACCESS_ONCE(). This is useful 498 */
285 * when the value of this pointer is accessed, but the pointer is not 499#define rcu_dereference(p) rcu_dereference_check(p, 0)
286 * dereferenced, for example, when testing an RCU-protected pointer against 500
287 * NULL. This may also be used in cases where update-side locks prevent 501/**
288 * the value of the pointer from changing, but rcu_dereference_protected() 502 * rcu_dereference_bh() - fetch an RCU-bh-protected pointer for dereferencing
289 * is a lighter-weight primitive for this use case. 503 * @p: The pointer to read, prior to dereferencing
504 *
505 * Makes rcu_dereference_check() do the dirty work.
506 */
507#define rcu_dereference_bh(p) rcu_dereference_bh_check(p, 0)
508
509/**
510 * rcu_dereference_sched() - fetch RCU-sched-protected pointer for dereferencing
511 * @p: The pointer to read, prior to dereferencing
512 *
513 * Makes rcu_dereference_check() do the dirty work.
290 */ 514 */
291#define rcu_access_pointer(p) ACCESS_ONCE(p) 515#define rcu_dereference_sched(p) rcu_dereference_sched_check(p, 0)
292 516
293/** 517/**
294 * rcu_read_lock - mark the beginning of an RCU read-side critical section. 518 * rcu_read_lock() - mark the beginning of an RCU read-side critical section
295 * 519 *
296 * When synchronize_rcu() is invoked on one CPU while other CPUs 520 * When synchronize_rcu() is invoked on one CPU while other CPUs
297 * are within RCU read-side critical sections, then the 521 * are within RCU read-side critical sections, then the
@@ -302,7 +526,7 @@ extern int rcu_my_thread_group_empty(void);
302 * until after all the other CPUs exit their critical sections. 526 * until after all the other CPUs exit their critical sections.
303 * 527 *
304 * Note, however, that RCU callbacks are permitted to run concurrently 528 * Note, however, that RCU callbacks are permitted to run concurrently
305 * with RCU read-side critical sections. One way that this can happen 529 * with new RCU read-side critical sections. One way that this can happen
306 * is via the following sequence of events: (1) CPU 0 enters an RCU 530 * is via the following sequence of events: (1) CPU 0 enters an RCU
307 * read-side critical section, (2) CPU 1 invokes call_rcu() to register 531 * read-side critical section, (2) CPU 1 invokes call_rcu() to register
308 * an RCU callback, (3) CPU 0 exits the RCU read-side critical section, 532 * an RCU callback, (3) CPU 0 exits the RCU read-side critical section,
@@ -317,7 +541,20 @@ extern int rcu_my_thread_group_empty(void);
317 * will be deferred until the outermost RCU read-side critical section 541 * will be deferred until the outermost RCU read-side critical section
318 * completes. 542 * completes.
319 * 543 *
320 * It is illegal to block while in an RCU read-side critical section. 544 * You can avoid reading and understanding the next paragraph by
545 * following this rule: don't put anything in an rcu_read_lock() RCU
546 * read-side critical section that would block in a !PREEMPT kernel.
547 * But if you want the full story, read on!
548 *
549 * In non-preemptible RCU implementations (TREE_RCU and TINY_RCU), it
550 * is illegal to block while in an RCU read-side critical section. In
551 * preemptible RCU implementations (TREE_PREEMPT_RCU and TINY_PREEMPT_RCU)
552 * in CONFIG_PREEMPT kernel builds, RCU read-side critical sections may
553 * be preempted, but explicit blocking is illegal. Finally, in preemptible
554 * RCU implementations in real-time (CONFIG_PREEMPT_RT) kernel builds,
555 * RCU read-side critical sections may be preempted and they may also
556 * block, but only when acquiring spinlocks that are subject to priority
557 * inheritance.
321 */ 558 */
322static inline void rcu_read_lock(void) 559static inline void rcu_read_lock(void)
323{ 560{
@@ -337,7 +574,7 @@ static inline void rcu_read_lock(void)
337 */ 574 */
338 575
339/** 576/**
340 * rcu_read_unlock - marks the end of an RCU read-side critical section. 577 * rcu_read_unlock() - marks the end of an RCU read-side critical section.
341 * 578 *
342 * See rcu_read_lock() for more information. 579 * See rcu_read_lock() for more information.
343 */ 580 */
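A hedged reader-side sketch of the rules spelled out above; struct foo and gbl_foo are invented for illustration and are assumed to be published elsewhere by an updater. Nothing between rcu_read_lock() and rcu_read_unlock() may block in a !PREEMPT kernel, and the protected pointer is fetched through rcu_dereference() exactly once:

#include <linux/rcupdate.h>

struct foo {
	int a;
};

static struct foo __rcu *gbl_foo;	/* assigned by some updater */

static int foo_get_a(void)
{
	struct foo *p;
	int ret = 0;

	rcu_read_lock();
	p = rcu_dereference(gbl_foo);
	if (p)
		ret = p->a;		/* no sleeping or blocking in here */
	rcu_read_unlock();
	return ret;
}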
@@ -349,15 +586,16 @@ static inline void rcu_read_unlock(void)
349} 586}
350 587
351/** 588/**
352 * rcu_read_lock_bh - mark the beginning of a softirq-only RCU critical section 589 * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section
353 * 590 *
354 * This is the equivalent of rcu_read_lock(), but to be used when updates 591 * This is the equivalent of rcu_read_lock(), but to be used when updates
355 * are being done using call_rcu_bh(). Since call_rcu_bh() callbacks 592 * are being done using call_rcu_bh() or synchronize_rcu_bh(). Since
356 * consider completion of a softirq handler to be a quiescent state, 593 * both call_rcu_bh() and synchronize_rcu_bh() consider completion of a
357 * a process in RCU read-side critical section must be protected by 594 * softirq handler to be a quiescent state, a process in RCU read-side
358 * disabling softirqs. Read-side critical sections in interrupt context 595 * critical section must be protected by disabling softirqs. Read-side
359 * can use just rcu_read_lock(). 596 * critical sections in interrupt context can use just rcu_read_lock(),
360 * 597 * though this should at least be commented to avoid confusing people
598 * reading the code.
361 */ 599 */
362static inline void rcu_read_lock_bh(void) 600static inline void rcu_read_lock_bh(void)
363{ 601{
@@ -379,13 +617,12 @@ static inline void rcu_read_unlock_bh(void)
379} 617}
380 618
381/** 619/**
382 * rcu_read_lock_sched - mark the beginning of a RCU-classic critical section 620 * rcu_read_lock_sched() - mark the beginning of a RCU-sched critical section
383 * 621 *
384 * Should be used with either 622 * This is the equivalent of rcu_read_lock(), but to be used when updates
385 * - synchronize_sched() 623 * are being done using call_rcu_sched() or synchronize_rcu_sched().
386 * or 624 * Read-side critical sections can also be introduced by anything that
387 * - call_rcu_sched() and rcu_barrier_sched() 625 * disables preemption, including local_irq_disable() and friends.
388 * on the write-side to insure proper synchronization.
389 */ 626 */
390static inline void rcu_read_lock_sched(void) 627static inline void rcu_read_lock_sched(void)
391{ 628{
@@ -420,54 +657,14 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
420 preempt_enable_notrace(); 657 preempt_enable_notrace();
421} 658}
422 659
423
424/** 660/**
425 * rcu_dereference_raw - fetch an RCU-protected pointer 661 * rcu_assign_pointer() - assign to RCU-protected pointer
662 * @p: pointer to assign to
663 * @v: value to assign (publish)
426 * 664 *
427 * The caller must be within some flavor of RCU read-side critical 665 * Assigns the specified value to the specified RCU-protected
428 * section, or must be otherwise preventing the pointer from changing, 666 * pointer, ensuring that any concurrent RCU readers will see
429 * for example, by holding an appropriate lock. This pointer may later 667 * any prior initialization. Returns the value assigned.
430 * be safely dereferenced. It is the caller's responsibility to have
431 * done the right thing, as this primitive does no checking of any kind.
432 *
433 * Inserts memory barriers on architectures that require them
434 * (currently only the Alpha), and, more importantly, documents
435 * exactly which pointers are protected by RCU.
436 */
437#define rcu_dereference_raw(p) ({ \
438 typeof(p) _________p1 = ACCESS_ONCE(p); \
439 smp_read_barrier_depends(); \
440 (_________p1); \
441 })
442
443/**
444 * rcu_dereference - fetch an RCU-protected pointer, checking for RCU
445 *
446 * Makes rcu_dereference_check() do the dirty work.
447 */
448#define rcu_dereference(p) \
449 rcu_dereference_check(p, rcu_read_lock_held())
450
451/**
452 * rcu_dereference_bh - fetch an RCU-protected pointer, checking for RCU-bh
453 *
454 * Makes rcu_dereference_check() do the dirty work.
455 */
456#define rcu_dereference_bh(p) \
457 rcu_dereference_check(p, rcu_read_lock_bh_held() || irqs_disabled())
458
459/**
460 * rcu_dereference_sched - fetch RCU-protected pointer, checking for RCU-sched
461 *
462 * Makes rcu_dereference_check() do the dirty work.
463 */
464#define rcu_dereference_sched(p) \
465 rcu_dereference_check(p, rcu_read_lock_sched_held())
466
467/**
468 * rcu_assign_pointer - assign (publicize) a pointer to a newly
469 * initialized structure that will be dereferenced by RCU read-side
470 * critical sections. Returns the value assigned.
471 * 668 *
472 * Inserts memory barriers on architectures that require them 669 * Inserts memory barriers on architectures that require them
473 * (pretty much all of them other than x86), and also prevents 670 * (pretty much all of them other than x86), and also prevents
@@ -476,14 +673,17 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
476 * call documents which pointers will be dereferenced by RCU read-side 673 * call documents which pointers will be dereferenced by RCU read-side
477 * code. 674 * code.
478 */ 675 */
479
480#define rcu_assign_pointer(p, v) \ 676#define rcu_assign_pointer(p, v) \
481 ({ \ 677 __rcu_assign_pointer((p), (v), __rcu)
482 if (!__builtin_constant_p(v) || \ 678
483 ((v) != NULL)) \ 679/**
484 smp_wmb(); \ 680 * RCU_INIT_POINTER() - initialize an RCU protected pointer
485 (p) = (v); \ 681 *
486 }) 682 * Initialize an RCU-protected pointer in such a way to avoid RCU-lockdep
683 * splats.
684 */
685#define RCU_INIT_POINTER(p, v) \
686 p = (typeof(*v) __force __rcu *)(v)
487 687
488/* Infrastructure to implement the synchronize_() primitives. */ 688/* Infrastructure to implement the synchronize_() primitives. */
489 689
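A hedged updater-side sketch pairing the two publication primitives above; struct conf, cur_conf and conf_lock are invented for illustration. RCU_INIT_POINTER() covers the initial assignment while nothing can yet be reading, rcu_assign_pointer() publishes a later update, rcu_dereference_protected() fetches the old value under the update-side lock, and synchronize_rcu() waits out pre-existing readers before the free:

#include <linux/errno.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct conf {
	int value;
};

static struct conf __rcu *cur_conf;
static DEFINE_SPINLOCK(conf_lock);

static int __init conf_init(void)
{
	struct conf *c = kzalloc(sizeof(*c), GFP_KERNEL);

	if (!c)
		return -ENOMEM;
	RCU_INIT_POINTER(cur_conf, c);	/* no readers yet, no barrier needed */
	return 0;
}

static int conf_update(int value)
{
	struct conf *newc = kmalloc(sizeof(*newc), GFP_KERNEL);
	struct conf *oldc;

	if (!newc)
		return -ENOMEM;
	newc->value = value;

	spin_lock(&conf_lock);
	oldc = rcu_dereference_protected(cur_conf,
					 lockdep_is_held(&conf_lock));
	rcu_assign_pointer(cur_conf, newc);	/* barrier, then publish */
	spin_unlock(&conf_lock);

	synchronize_rcu();			/* wait for pre-existing readers */
	kfree(oldc);
	return 0;
}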
@@ -494,26 +694,37 @@ struct rcu_synchronize {
494 694
495extern void wakeme_after_rcu(struct rcu_head *head); 695extern void wakeme_after_rcu(struct rcu_head *head);
496 696
697#ifdef CONFIG_PREEMPT_RCU
698
497/** 699/**
498 * call_rcu - Queue an RCU callback for invocation after a grace period. 700 * call_rcu() - Queue an RCU callback for invocation after a grace period.
499 * @head: structure to be used for queueing the RCU updates. 701 * @head: structure to be used for queueing the RCU updates.
500 * @func: actual update function to be invoked after the grace period 702 * @func: actual callback function to be invoked after the grace period
501 * 703 *
502 * The update function will be invoked some time after a full grace 704 * The callback function will be invoked some time after a full grace
503 * period elapses, in other words after all currently executing RCU 705 * period elapses, in other words after all pre-existing RCU read-side
504 * read-side critical sections have completed. RCU read-side critical 706 * critical sections have completed. However, the callback function
707 * might well execute concurrently with RCU read-side critical sections
708 * that started after call_rcu() was invoked. RCU read-side critical
505 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), 709 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
506 * and may be nested. 710 * and may be nested.
507 */ 711 */
508extern void call_rcu(struct rcu_head *head, 712extern void call_rcu(struct rcu_head *head,
509 void (*func)(struct rcu_head *head)); 713 void (*func)(struct rcu_head *head));
510 714
715#else /* #ifdef CONFIG_PREEMPT_RCU */
716
717/* In classic RCU, call_rcu() is just call_rcu_sched(). */
718#define call_rcu call_rcu_sched
719
720#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
721
511/** 722/**
512 * call_rcu_bh - Queue an RCU for invocation after a quicker grace period. 723 * call_rcu_bh() - Queue an RCU for invocation after a quicker grace period.
513 * @head: structure to be used for queueing the RCU updates. 724 * @head: structure to be used for queueing the RCU updates.
514 * @func: actual update function to be invoked after the grace period 725 * @func: actual callback function to be invoked after the grace period
515 * 726 *
516 * The update function will be invoked some time after a full grace 727 * The callback function will be invoked some time after a full grace
517 * period elapses, in other words after all currently executing RCU 728 * period elapses, in other words after all currently executing RCU
518 * read-side critical sections have completed. call_rcu_bh() assumes 729 * read-side critical sections have completed. call_rcu_bh() assumes
519 * that the read-side critical sections end on completion of a softirq 730 * that the read-side critical sections end on completion of a softirq
@@ -566,37 +777,4 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head)
566} 777}
567#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 778#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
568 779
569#ifndef CONFIG_PROVE_RCU
570#define __do_rcu_dereference_check(c) do { } while (0)
571#endif /* #ifdef CONFIG_PROVE_RCU */
572
573#define __rcu_dereference_index_check(p, c) \
574 ({ \
575 typeof(p) _________p1 = ACCESS_ONCE(p); \
576 __do_rcu_dereference_check(c); \
577 smp_read_barrier_depends(); \
578 (_________p1); \
579 })
580
581/**
582 * rcu_dereference_index_check() - rcu_dereference for indices with debug checking
583 * @p: The pointer to read, prior to dereferencing
584 * @c: The conditions under which the dereference will take place
585 *
586 * Similar to rcu_dereference_check(), but omits the sparse checking.
587 * This allows rcu_dereference_index_check() to be used on integers,
588 * which can then be used as array indices. Attempting to use
589 * rcu_dereference_check() on an integer will give compiler warnings
590 * because the sparse address-space mechanism relies on dereferencing
591 * the RCU-protected pointer. Dereferencing integers is not something
592 * that even gcc will put up with.
593 *
594 * Note that this function does not implicitly check for RCU read-side
595 * critical sections. If this function gains lots of uses, it might
596 * make sense to provide versions for each flavor of RCU, but it does
597 * not make sense as of early 2010.
598 */
599#define rcu_dereference_index_check(p, c) \
600 __rcu_dereference_index_check((p), (c))
601
602#endif /* __LINUX_RCUPDATE_H */ 780#endif /* __LINUX_RCUPDATE_H */
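Tying the checked accessors together, a hedged sketch modeled on the foo->bar example in the rcu_dereference_check() comment above (the struct layout and both functions are invented): rcu_access_pointer() is enough for a pure NULL test, while rcu_dereference_check() documents that the field may be read either inside rcu_read_lock() or with the owner's lock held.

#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct bar {
	int count;
};

struct foo {
	spinlock_t lock;
	struct bar __rcu *bar;
};

static bool foo_has_bar(struct foo *foo)
{
	/* the pointer is only tested, never dereferenced */
	return rcu_access_pointer(foo->bar) != NULL;
}

static int foo_bar_count(struct foo *foo)
{
	struct bar *b;
	int ret = 0;

	rcu_read_lock();
	b = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock));
	if (b)
		ret = b->count;
	rcu_read_unlock();
	return ret;
}

Under sparse, a direct dereference of the __rcu-annotated field without one of these accessors would now warn, which is what the annotations added in this patch are for.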
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index e2e893144a84..13877cb93a60 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -27,103 +27,101 @@
27 27
28#include <linux/cache.h> 28#include <linux/cache.h>
29 29
30void rcu_sched_qs(int cpu); 30#define rcu_init_sched() do { } while (0)
31void rcu_bh_qs(int cpu);
32static inline void rcu_note_context_switch(int cpu)
33{
34 rcu_sched_qs(cpu);
35}
36 31
37#define __rcu_read_lock() preempt_disable() 32#ifdef CONFIG_TINY_RCU
38#define __rcu_read_unlock() preempt_enable()
39#define __rcu_read_lock_bh() local_bh_disable()
40#define __rcu_read_unlock_bh() local_bh_enable()
41#define call_rcu_sched call_rcu
42 33
43#define rcu_init_sched() do { } while (0) 34static inline void synchronize_rcu_expedited(void)
44extern void rcu_check_callbacks(int cpu, int user); 35{
36 synchronize_sched(); /* Only one CPU, so pretty fast anyway!!! */
37}
45 38
46static inline int rcu_needs_cpu(int cpu) 39static inline void rcu_barrier(void)
47{ 40{
48 return 0; 41 rcu_barrier_sched(); /* Only one CPU, so only one list of callbacks! */
49} 42}
50 43
51/* 44#else /* #ifdef CONFIG_TINY_RCU */
52 * Return the number of grace periods. 45
53 */ 46void rcu_barrier(void);
54static inline long rcu_batches_completed(void) 47void synchronize_rcu_expedited(void);
48
49#endif /* #else #ifdef CONFIG_TINY_RCU */
50
51static inline void synchronize_rcu_bh(void)
55{ 52{
56 return 0; 53 synchronize_sched();
57} 54}
58 55
59/* 56static inline void synchronize_rcu_bh_expedited(void)
60 * Return the number of bottom-half grace periods.
61 */
62static inline long rcu_batches_completed_bh(void)
63{ 57{
64 return 0; 58 synchronize_sched();
65} 59}
66 60
67static inline void rcu_force_quiescent_state(void) 61#ifdef CONFIG_TINY_RCU
62
63static inline void rcu_preempt_note_context_switch(void)
68{ 64{
69} 65}
70 66
71static inline void rcu_bh_force_quiescent_state(void) 67static inline void exit_rcu(void)
72{ 68{
73} 69}
74 70
75static inline void rcu_sched_force_quiescent_state(void) 71static inline int rcu_needs_cpu(int cpu)
76{ 72{
73 return 0;
77} 74}
78 75
79extern void synchronize_sched(void); 76#else /* #ifdef CONFIG_TINY_RCU */
77
78void rcu_preempt_note_context_switch(void);
79extern void exit_rcu(void);
80int rcu_preempt_needs_cpu(void);
80 81
81static inline void synchronize_rcu(void) 82static inline int rcu_needs_cpu(int cpu)
82{ 83{
83 synchronize_sched(); 84 return rcu_preempt_needs_cpu();
84} 85}
85 86
86static inline void synchronize_rcu_bh(void) 87#endif /* #else #ifdef CONFIG_TINY_RCU */
88
89static inline void rcu_note_context_switch(int cpu)
87{ 90{
88 synchronize_sched(); 91 rcu_sched_qs(cpu);
92 rcu_preempt_note_context_switch();
89} 93}
90 94
91static inline void synchronize_rcu_expedited(void) 95/*
96 * Return the number of grace periods.
97 */
98static inline long rcu_batches_completed(void)
92{ 99{
93 synchronize_sched(); 100 return 0;
94} 101}
95 102
96static inline void synchronize_rcu_bh_expedited(void) 103/*
104 * Return the number of bottom-half grace periods.
105 */
106static inline long rcu_batches_completed_bh(void)
97{ 107{
98 synchronize_sched(); 108 return 0;
99} 109}
100 110
101struct notifier_block; 111static inline void rcu_force_quiescent_state(void)
102
103#ifdef CONFIG_NO_HZ
104
105extern void rcu_enter_nohz(void);
106extern void rcu_exit_nohz(void);
107
108#else /* #ifdef CONFIG_NO_HZ */
109
110static inline void rcu_enter_nohz(void)
111{ 112{
112} 113}
113 114
114static inline void rcu_exit_nohz(void) 115static inline void rcu_bh_force_quiescent_state(void)
115{ 116{
116} 117}
117 118
118#endif /* #else #ifdef CONFIG_NO_HZ */ 119static inline void rcu_sched_force_quiescent_state(void)
119
120static inline void exit_rcu(void)
121{ 120{
122} 121}
123 122
124static inline int rcu_preempt_depth(void) 123static inline void rcu_cpu_stall_reset(void)
125{ 124{
126 return 0;
127} 125}
128 126
129#ifdef CONFIG_DEBUG_LOCK_ALLOC 127#ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index c0ed1c056f29..95518e628794 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -30,64 +30,23 @@
30#ifndef __LINUX_RCUTREE_H 30#ifndef __LINUX_RCUTREE_H
31#define __LINUX_RCUTREE_H 31#define __LINUX_RCUTREE_H
32 32
33struct notifier_block;
34
35extern void rcu_sched_qs(int cpu);
36extern void rcu_bh_qs(int cpu);
37extern void rcu_note_context_switch(int cpu); 33extern void rcu_note_context_switch(int cpu);
38extern int rcu_needs_cpu(int cpu); 34extern int rcu_needs_cpu(int cpu);
35extern void rcu_cpu_stall_reset(void);
39 36
40#ifdef CONFIG_TREE_PREEMPT_RCU 37#ifdef CONFIG_TREE_PREEMPT_RCU
41 38
42extern void __rcu_read_lock(void);
43extern void __rcu_read_unlock(void);
44extern void synchronize_rcu(void);
45extern void exit_rcu(void); 39extern void exit_rcu(void);
46 40
47/*
48 * Defined as macro as it is a very low level header
49 * included from areas that don't even know about current
50 */
51#define rcu_preempt_depth() (current->rcu_read_lock_nesting)
52
53#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 41#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
54 42
55static inline void __rcu_read_lock(void)
56{
57 preempt_disable();
58}
59
60static inline void __rcu_read_unlock(void)
61{
62 preempt_enable();
63}
64
65#define synchronize_rcu synchronize_sched
66
67static inline void exit_rcu(void) 43static inline void exit_rcu(void)
68{ 44{
69} 45}
70 46
71static inline int rcu_preempt_depth(void)
72{
73 return 0;
74}
75
76#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 47#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
77 48
78static inline void __rcu_read_lock_bh(void)
79{
80 local_bh_disable();
81}
82static inline void __rcu_read_unlock_bh(void)
83{
84 local_bh_enable();
85}
86
87extern void call_rcu_sched(struct rcu_head *head,
88 void (*func)(struct rcu_head *rcu));
89extern void synchronize_rcu_bh(void); 49extern void synchronize_rcu_bh(void);
90extern void synchronize_sched(void);
91extern void synchronize_rcu_expedited(void); 50extern void synchronize_rcu_expedited(void);
92 51
93static inline void synchronize_rcu_bh_expedited(void) 52static inline void synchronize_rcu_bh_expedited(void)
@@ -95,7 +54,7 @@ static inline void synchronize_rcu_bh_expedited(void)
95 synchronize_sched_expedited(); 54 synchronize_sched_expedited();
96} 55}
97 56
98extern void rcu_check_callbacks(int cpu, int user); 57extern void rcu_barrier(void);
99 58
100extern long rcu_batches_completed(void); 59extern long rcu_batches_completed(void);
101extern long rcu_batches_completed_bh(void); 60extern long rcu_batches_completed_bh(void);
@@ -104,18 +63,6 @@ extern void rcu_force_quiescent_state(void);
104extern void rcu_bh_force_quiescent_state(void); 63extern void rcu_bh_force_quiescent_state(void);
105extern void rcu_sched_force_quiescent_state(void); 64extern void rcu_sched_force_quiescent_state(void);
106 65
107#ifdef CONFIG_NO_HZ
108void rcu_enter_nohz(void);
109void rcu_exit_nohz(void);
110#else /* CONFIG_NO_HZ */
111static inline void rcu_enter_nohz(void)
112{
113}
114static inline void rcu_exit_nohz(void)
115{
116}
117#endif /* CONFIG_NO_HZ */
118
119/* A context switch is a grace period for RCU-sched and RCU-bh. */ 66/* A context switch is a grace period for RCU-sched and RCU-bh. */
120static inline int rcu_blocking_is_gp(void) 67static inline int rcu_blocking_is_gp(void)
121{ 68{
diff --git a/include/linux/sched.h b/include/linux/sched.h
index eb3c1ceec06e..0383601a927c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -875,6 +875,7 @@ enum sched_domain_level {
875 SD_LV_NONE = 0, 875 SD_LV_NONE = 0,
876 SD_LV_SIBLING, 876 SD_LV_SIBLING,
877 SD_LV_MC, 877 SD_LV_MC,
878 SD_LV_BOOK,
878 SD_LV_CPU, 879 SD_LV_CPU,
879 SD_LV_NODE, 880 SD_LV_NODE,
880 SD_LV_ALLNODES, 881 SD_LV_ALLNODES,
@@ -1209,11 +1210,13 @@ struct task_struct {
1209 unsigned int policy; 1210 unsigned int policy;
1210 cpumask_t cpus_allowed; 1211 cpumask_t cpus_allowed;
1211 1212
1212#ifdef CONFIG_TREE_PREEMPT_RCU 1213#ifdef CONFIG_PREEMPT_RCU
1213 int rcu_read_lock_nesting; 1214 int rcu_read_lock_nesting;
1214 char rcu_read_unlock_special; 1215 char rcu_read_unlock_special;
1215 struct rcu_node *rcu_blocked_node;
1216 struct list_head rcu_node_entry; 1216 struct list_head rcu_node_entry;
1217#endif /* #ifdef CONFIG_PREEMPT_RCU */
1218#ifdef CONFIG_TREE_PREEMPT_RCU
1219 struct rcu_node *rcu_blocked_node;
1217#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 1220#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
1218 1221
1219#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 1222#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
@@ -1295,9 +1298,9 @@ struct task_struct {
1295 struct list_head cpu_timers[3]; 1298 struct list_head cpu_timers[3];
1296 1299
1297/* process credentials */ 1300/* process credentials */
1298 const struct cred *real_cred; /* objective and real subjective task 1301 const struct cred __rcu *real_cred; /* objective and real subjective task
1299 * credentials (COW) */ 1302 * credentials (COW) */
1300 const struct cred *cred; /* effective (overridable) subjective task 1303 const struct cred __rcu *cred; /* effective (overridable) subjective task
1301 * credentials (COW) */ 1304 * credentials (COW) */
1302 struct mutex cred_guard_mutex; /* guard against foreign influences on 1305 struct mutex cred_guard_mutex; /* guard against foreign influences on
1303 * credential calculations 1306 * credential calculations
@@ -1425,7 +1428,7 @@ struct task_struct {
1425#endif 1428#endif
1426#ifdef CONFIG_CGROUPS 1429#ifdef CONFIG_CGROUPS
1427 /* Control Group info protected by css_set_lock */ 1430 /* Control Group info protected by css_set_lock */
1428 struct css_set *cgroups; 1431 struct css_set __rcu *cgroups;
1429 /* cg_list protected by css_set_lock and tsk->alloc_lock */ 1432 /* cg_list protected by css_set_lock and tsk->alloc_lock */
1430 struct list_head cg_list; 1433 struct list_head cg_list;
1431#endif 1434#endif
@@ -1688,8 +1691,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
1688/* 1691/*
1689 * Per process flags 1692 * Per process flags
1690 */ 1693 */
1691#define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */ 1694#define PF_KSOFTIRQD 0x00000001 /* I am ksoftirqd */
1692 /* Not implemented yet, only for 486*/
1693#define PF_STARTING 0x00000002 /* being created */ 1695#define PF_STARTING 0x00000002 /* being created */
1694#define PF_EXITING 0x00000004 /* getting shut down */ 1696#define PF_EXITING 0x00000004 /* getting shut down */
1695#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */ 1697#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
@@ -1747,7 +1749,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
1747#define tsk_used_math(p) ((p)->flags & PF_USED_MATH) 1749#define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
1748#define used_math() tsk_used_math(current) 1750#define used_math() tsk_used_math(current)
1749 1751
1750#ifdef CONFIG_TREE_PREEMPT_RCU 1752#ifdef CONFIG_PREEMPT_RCU
1751 1753
1752#define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ 1754#define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
1753#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */ 1755#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
@@ -1756,7 +1758,9 @@ static inline void rcu_copy_process(struct task_struct *p)
1756{ 1758{
1757 p->rcu_read_lock_nesting = 0; 1759 p->rcu_read_lock_nesting = 0;
1758 p->rcu_read_unlock_special = 0; 1760 p->rcu_read_unlock_special = 0;
1761#ifdef CONFIG_TREE_PREEMPT_RCU
1759 p->rcu_blocked_node = NULL; 1762 p->rcu_blocked_node = NULL;
1763#endif
1760 INIT_LIST_HEAD(&p->rcu_node_entry); 1764 INIT_LIST_HEAD(&p->rcu_node_entry);
1761} 1765}
1762 1766
@@ -1833,6 +1837,19 @@ extern void sched_clock_idle_sleep_event(void);
1833extern void sched_clock_idle_wakeup_event(u64 delta_ns); 1837extern void sched_clock_idle_wakeup_event(u64 delta_ns);
1834#endif 1838#endif
1835 1839
1840#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1841/*
1842 * An i/f to runtime opt-in for irq time accounting based off of sched_clock.
1843 * The reason for this explicit opt-in is not to have perf penalty with
1844 * slow sched_clocks.
1845 */
1846extern void enable_sched_clock_irqtime(void);
1847extern void disable_sched_clock_irqtime(void);
1848#else
1849static inline void enable_sched_clock_irqtime(void) {}
1850static inline void disable_sched_clock_irqtime(void) {}
1851#endif
1852
1836extern unsigned long long 1853extern unsigned long long
1837task_sched_runtime(struct task_struct *task); 1854task_sched_runtime(struct task_struct *task);
1838extern unsigned long long thread_group_sched_runtime(struct task_struct *task); 1855extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
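Note: the empty inline stubs above let generic and architecture code call the irqtime interface unconditionally; only CONFIG_IRQ_TIME_ACCOUNTING=y builds do any work. A minimal sketch of the intended opt-in, assuming a hypothetical arch-side predicate for "sched_clock() is cheap enough":

    /* arch setup code -- sketch only, arch_sched_clock_is_fast() is made up */
    static void __init arch_irqtime_init(void)
    {
            if (arch_sched_clock_is_fast())
                    enable_sched_clock_irqtime();
    }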
@@ -2374,9 +2391,9 @@ extern int __cond_resched_lock(spinlock_t *lock);
2374 2391
2375extern int __cond_resched_softirq(void); 2392extern int __cond_resched_softirq(void);
2376 2393
2377#define cond_resched_softirq() ({ \ 2394#define cond_resched_softirq() ({ \
2378 __might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET); \ 2395 __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \
2379 __cond_resched_softirq(); \ 2396 __cond_resched_softirq(); \
2380}) 2397})
2381 2398
2382/* 2399/*
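Note: the __rcu annotations added above (real_cred, cred, cgroups) make sparse complain when such a pointer is read without an RCU primitive, once the RCU pointer-checking option is enabled. A reader-side sketch for the credential pointers; in the tree itself, helpers such as __task_cred() wrap this pattern:

    const struct cred *cred;

    rcu_read_lock();
    cred = rcu_dereference(task->real_cred);
    /* cred may only be used inside this read-side critical section */
    pr_debug("uid=%u\n", cred->uid);
    rcu_read_unlock();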
diff --git a/include/linux/security.h b/include/linux/security.h
index a22219afff09..b8246a8df7d2 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -74,7 +74,7 @@ extern int cap_file_mmap(struct file *file, unsigned long reqprot,
74extern int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags); 74extern int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags);
75extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, 75extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
76 unsigned long arg4, unsigned long arg5); 76 unsigned long arg4, unsigned long arg5);
77extern int cap_task_setscheduler(struct task_struct *p, int policy, struct sched_param *lp); 77extern int cap_task_setscheduler(struct task_struct *p);
78extern int cap_task_setioprio(struct task_struct *p, int ioprio); 78extern int cap_task_setioprio(struct task_struct *p, int ioprio);
79extern int cap_task_setnice(struct task_struct *p, int nice); 79extern int cap_task_setnice(struct task_struct *p, int nice);
80extern int cap_syslog(int type, bool from_file); 80extern int cap_syslog(int type, bool from_file);
@@ -959,6 +959,12 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
959 * Sets the new child socket's sid to the openreq sid. 959 * Sets the new child socket's sid to the openreq sid.
960 * @inet_conn_established: 960 * @inet_conn_established:
961 * Sets the connection's peersid to the secmark on skb. 961 * Sets the connection's peersid to the secmark on skb.
962 * @secmark_relabel_packet:
963 * check if the process should be allowed to relabel packets to the given secid
964 * @security_secmark_refcount_inc
965 * tells the LSM to increment the number of secmark labeling rules loaded
966 * @security_secmark_refcount_dec
967 * tells the LSM to decrement the number of secmark labeling rules loaded
962 * @req_classify_flow: 968 * @req_classify_flow:
963 * Sets the flow's sid to the openreq sid. 969 * Sets the flow's sid to the openreq sid.
964 * @tun_dev_create: 970 * @tun_dev_create:
@@ -1279,9 +1285,13 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
1279 * Return 0 if permission is granted. 1285 * Return 0 if permission is granted.
1280 * 1286 *
1281 * @secid_to_secctx: 1287 * @secid_to_secctx:
1282 * Convert secid to security context. 1288 * Convert secid to security context. If secdata is NULL the length of
1289 * the result will be returned in seclen, but no secdata will be returned.
1290 * This does mean that the length could change between calls to check the
1291 * length and the next call which actually allocates and returns the secdata.
1283 * @secid contains the security ID. 1292 * @secid contains the security ID.
1284 * @secdata contains the pointer that stores the converted security context. 1293 * @secdata contains the pointer that stores the converted security context.
1294 * @seclen pointer which contains the length of the data
1285 * @secctx_to_secid: 1295 * @secctx_to_secid:
1286 * Convert security context to secid. 1296 * Convert security context to secid.
1287 * @secid contains the pointer to the generated security ID. 1297 * @secid contains the pointer to the generated security ID.
@@ -1501,8 +1511,7 @@ struct security_operations {
1501 int (*task_getioprio) (struct task_struct *p); 1511 int (*task_getioprio) (struct task_struct *p);
1502 int (*task_setrlimit) (struct task_struct *p, unsigned int resource, 1512 int (*task_setrlimit) (struct task_struct *p, unsigned int resource,
1503 struct rlimit *new_rlim); 1513 struct rlimit *new_rlim);
1504 int (*task_setscheduler) (struct task_struct *p, int policy, 1514 int (*task_setscheduler) (struct task_struct *p);
1505 struct sched_param *lp);
1506 int (*task_getscheduler) (struct task_struct *p); 1515 int (*task_getscheduler) (struct task_struct *p);
1507 int (*task_movememory) (struct task_struct *p); 1516 int (*task_movememory) (struct task_struct *p);
1508 int (*task_kill) (struct task_struct *p, 1517 int (*task_kill) (struct task_struct *p,
@@ -1594,6 +1603,9 @@ struct security_operations {
1594 struct request_sock *req); 1603 struct request_sock *req);
1595 void (*inet_csk_clone) (struct sock *newsk, const struct request_sock *req); 1604 void (*inet_csk_clone) (struct sock *newsk, const struct request_sock *req);
1596 void (*inet_conn_established) (struct sock *sk, struct sk_buff *skb); 1605 void (*inet_conn_established) (struct sock *sk, struct sk_buff *skb);
1606 int (*secmark_relabel_packet) (u32 secid);
1607 void (*secmark_refcount_inc) (void);
1608 void (*secmark_refcount_dec) (void);
1597 void (*req_classify_flow) (const struct request_sock *req, struct flowi *fl); 1609 void (*req_classify_flow) (const struct request_sock *req, struct flowi *fl);
1598 int (*tun_dev_create)(void); 1610 int (*tun_dev_create)(void);
1599 void (*tun_dev_post_create)(struct sock *sk); 1611 void (*tun_dev_post_create)(struct sock *sk);
@@ -1752,8 +1764,7 @@ int security_task_setioprio(struct task_struct *p, int ioprio);
1752int security_task_getioprio(struct task_struct *p); 1764int security_task_getioprio(struct task_struct *p);
1753int security_task_setrlimit(struct task_struct *p, unsigned int resource, 1765int security_task_setrlimit(struct task_struct *p, unsigned int resource,
1754 struct rlimit *new_rlim); 1766 struct rlimit *new_rlim);
1755int security_task_setscheduler(struct task_struct *p, 1767int security_task_setscheduler(struct task_struct *p);
1756 int policy, struct sched_param *lp);
1757int security_task_getscheduler(struct task_struct *p); 1768int security_task_getscheduler(struct task_struct *p);
1758int security_task_movememory(struct task_struct *p); 1769int security_task_movememory(struct task_struct *p);
1759int security_task_kill(struct task_struct *p, struct siginfo *info, 1770int security_task_kill(struct task_struct *p, struct siginfo *info,
@@ -2320,11 +2331,9 @@ static inline int security_task_setrlimit(struct task_struct *p,
2320 return 0; 2331 return 0;
2321} 2332}
2322 2333
2323static inline int security_task_setscheduler(struct task_struct *p, 2334static inline int security_task_setscheduler(struct task_struct *p)
2324 int policy,
2325 struct sched_param *lp)
2326{ 2335{
2327 return cap_task_setscheduler(p, policy, lp); 2336 return cap_task_setscheduler(p);
2328} 2337}
2329 2338
2330static inline int security_task_getscheduler(struct task_struct *p) 2339static inline int security_task_getscheduler(struct task_struct *p)
@@ -2551,6 +2560,9 @@ void security_inet_csk_clone(struct sock *newsk,
2551 const struct request_sock *req); 2560 const struct request_sock *req);
2552void security_inet_conn_established(struct sock *sk, 2561void security_inet_conn_established(struct sock *sk,
2553 struct sk_buff *skb); 2562 struct sk_buff *skb);
2563int security_secmark_relabel_packet(u32 secid);
2564void security_secmark_refcount_inc(void);
2565void security_secmark_refcount_dec(void);
2554int security_tun_dev_create(void); 2566int security_tun_dev_create(void);
2555void security_tun_dev_post_create(struct sock *sk); 2567void security_tun_dev_post_create(struct sock *sk);
2556int security_tun_dev_attach(struct sock *sk); 2568int security_tun_dev_attach(struct sock *sk);
@@ -2705,6 +2717,19 @@ static inline void security_inet_conn_established(struct sock *sk,
2705{ 2717{
2706} 2718}
2707 2719
2720static inline int security_secmark_relabel_packet(u32 secid)
2721{
2722 return 0;
2723}
2724
2725static inline void security_secmark_refcount_inc(void)
2726{
2727}
2728
2729static inline void security_secmark_refcount_dec(void)
2730{
2731}
2732
2708static inline int security_tun_dev_create(void) 2733static inline int security_tun_dev_create(void)
2709{ 2734{
2710 return 0; 2735 return 0;
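Note: with the narrowed hook, a security module's setscheduler check now sees only the target task; the requested policy and parameters are no longer passed in. A sketch of a module-side implementation and its hook-table entry (the module name is hypothetical; the fallback is the cap_* helper declared earlier in this header):

    static int example_task_setscheduler(struct task_struct *p)
    {
            /* module-specific checks on p would go here */
            return cap_task_setscheduler(p);
    }

    static struct security_operations example_ops = {
            .task_setscheduler      = example_task_setscheduler,
            /* ... */
    };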
diff --git a/include/linux/selinux.h b/include/linux/selinux.h
index 82e0f26a1299..44f459612690 100644
--- a/include/linux/selinux.h
+++ b/include/linux/selinux.h
@@ -21,74 +21,11 @@ struct kern_ipc_perm;
21#ifdef CONFIG_SECURITY_SELINUX 21#ifdef CONFIG_SECURITY_SELINUX
22 22
23/** 23/**
24 * selinux_string_to_sid - map a security context string to a security ID
25 * @str: the security context string to be mapped
26 * @sid: ID value returned via this.
27 *
28 * Returns 0 if successful, with the SID stored in sid. A value
29 * of zero for sid indicates no SID could be determined (but no error
30 * occurred).
31 */
32int selinux_string_to_sid(char *str, u32 *sid);
33
34/**
35 * selinux_secmark_relabel_packet_permission - secmark permission check
36 * @sid: SECMARK ID value to be applied to network packet
37 *
38 * Returns 0 if the current task is allowed to set the SECMARK label of
39 * packets with the supplied security ID. Note that it is implicit that
40 * the packet is always being relabeled from the default unlabeled value,
41 * and that the access control decision is made in the AVC.
42 */
43int selinux_secmark_relabel_packet_permission(u32 sid);
44
45/**
46 * selinux_secmark_refcount_inc - increments the secmark use counter
47 *
48 * SELinux keeps track of the current SECMARK targets in use so it knows
49 * when to apply SECMARK label access checks to network packets. This
50 * function incements this reference count to indicate that a new SECMARK
51 * target has been configured.
52 */
53void selinux_secmark_refcount_inc(void);
54
55/**
56 * selinux_secmark_refcount_dec - decrements the secmark use counter
57 *
58 * SELinux keeps track of the current SECMARK targets in use so it knows
59 * when to apply SECMARK label access checks to network packets. This
60 * function decements this reference count to indicate that one of the
61 * existing SECMARK targets has been removed/flushed.
62 */
63void selinux_secmark_refcount_dec(void);
64
65/**
66 * selinux_is_enabled - is SELinux enabled? 24 * selinux_is_enabled - is SELinux enabled?
67 */ 25 */
68bool selinux_is_enabled(void); 26bool selinux_is_enabled(void);
69#else 27#else
70 28
71static inline int selinux_string_to_sid(const char *str, u32 *sid)
72{
73 *sid = 0;
74 return 0;
75}
76
77static inline int selinux_secmark_relabel_packet_permission(u32 sid)
78{
79 return 0;
80}
81
82static inline void selinux_secmark_refcount_inc(void)
83{
84 return;
85}
86
87static inline void selinux_secmark_refcount_dec(void)
88{
89 return;
90}
91
92static inline bool selinux_is_enabled(void) 29static inline bool selinux_is_enabled(void)
93{ 30{
94 return false; 31 return false;
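Note: the SELinux-specific secmark helpers removed above are superseded by the LSM-neutral security_secmark_*() calls added to security.h in this same series. A caller-side sketch, roughly what a SECMARK-style netfilter target would do (structure and field names are illustrative):

    /* rule add: may the current task relabel packets to this secid? */
    err = security_secmark_relabel_packet(info->secid);
    if (err)
            return err;
    security_secmark_refcount_inc();

    /* rule destroy */
    security_secmark_refcount_dec();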
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 4d5d2f546dbf..58971e891f48 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -108,19 +108,43 @@ static inline int srcu_read_lock_held(struct srcu_struct *sp)
108#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 108#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
109 109
110/** 110/**
111 * srcu_dereference - fetch SRCU-protected pointer with checking 111 * srcu_dereference_check - fetch SRCU-protected pointer for later dereferencing
112 * @p: the pointer to fetch and protect for later dereferencing
113 * @sp: pointer to the srcu_struct, which is used to check that we
114 * really are in an SRCU read-side critical section.
115 * @c: condition to check for update-side use
112 * 116 *
113 * Makes rcu_dereference_check() do the dirty work. 117 * If PROVE_RCU is enabled, invoking this outside of an RCU read-side
118 * critical section will result in an RCU-lockdep splat, unless @c evaluates
119 * to 1. The @c argument will normally be a logical expression containing
120 * lockdep_is_held() calls.
114 */ 121 */
115#define srcu_dereference(p, sp) \ 122#define srcu_dereference_check(p, sp, c) \
116 rcu_dereference_check(p, srcu_read_lock_held(sp)) 123 __rcu_dereference_check((p), srcu_read_lock_held(sp) || (c), __rcu)
124
125/**
126 * srcu_dereference - fetch SRCU-protected pointer for later dereferencing
127 * @p: the pointer to fetch and protect for later dereferencing
128 * @sp: pointer to the srcu_struct, which is used to check that we
129 * really are in an SRCU read-side critical section.
130 *
131 * Makes rcu_dereference_check() do the dirty work. If PROVE_RCU
132 * is enabled, invoking this outside of an RCU read-side critical
133 * section will result in an RCU-lockdep splat.
134 */
135#define srcu_dereference(p, sp) srcu_dereference_check((p), (sp), 0)
117 136
118/** 137/**
119 * srcu_read_lock - register a new reader for an SRCU-protected structure. 138 * srcu_read_lock - register a new reader for an SRCU-protected structure.
120 * @sp: srcu_struct in which to register the new reader. 139 * @sp: srcu_struct in which to register the new reader.
121 * 140 *
122 * Enter an SRCU read-side critical section. Note that SRCU read-side 141 * Enter an SRCU read-side critical section. Note that SRCU read-side
123 * critical sections may be nested. 142 * critical sections may be nested. However, it is illegal to
143 * call anything that waits on an SRCU grace period for the same
144 * srcu_struct, whether directly or indirectly. Please note that
145 * one way to indirectly wait on an SRCU grace period is to acquire
146 * a mutex that is held elsewhere while calling synchronize_srcu() or
147 * synchronize_srcu_expedited().
124 */ 148 */
125static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp) 149static inline int srcu_read_lock(struct srcu_struct *sp) __acquires(sp)
126{ 150{
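Note: srcu_dereference_check() mirrors rcu_dereference_check(): the extra condition suppresses the lockdep splat for update-side callers that hold the lock guarding assignments to the pointer. A sketch (the srcu domain, pointer and mutex names are made up):

    /* reader */
    idx = srcu_read_lock(&my_srcu);
    p = srcu_dereference(gp, &my_srcu);
    if (p)
            do_something(p);
    srcu_read_unlock(&my_srcu, idx);

    /* updater, serialized by my_mutex */
    p = srcu_dereference_check(gp, &my_srcu, lockdep_is_held(&my_mutex));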
diff --git a/include/linux/sunrpc/auth_gss.h b/include/linux/sunrpc/auth_gss.h
index 671538d25bc1..8eee9dbbfe7a 100644
--- a/include/linux/sunrpc/auth_gss.h
+++ b/include/linux/sunrpc/auth_gss.h
@@ -69,7 +69,7 @@ struct gss_cl_ctx {
69 enum rpc_gss_proc gc_proc; 69 enum rpc_gss_proc gc_proc;
70 u32 gc_seq; 70 u32 gc_seq;
71 spinlock_t gc_seq_lock; 71 spinlock_t gc_seq_lock;
72 struct gss_ctx *gc_gss_ctx; 72 struct gss_ctx __rcu *gc_gss_ctx;
73 struct xdr_netobj gc_wire_ctx; 73 struct xdr_netobj gc_wire_ctx;
74 u32 gc_win; 74 u32 gc_win;
75 unsigned long gc_expiry; 75 unsigned long gc_expiry;
@@ -80,7 +80,7 @@ struct gss_upcall_msg;
80struct gss_cred { 80struct gss_cred {
81 struct rpc_cred gc_base; 81 struct rpc_cred gc_base;
82 enum rpc_gss_svc gc_service; 82 enum rpc_gss_svc gc_service;
83 struct gss_cl_ctx *gc_ctx; 83 struct gss_cl_ctx __rcu *gc_ctx;
84 struct gss_upcall_msg *gc_upcall; 84 struct gss_upcall_msg *gc_upcall;
85 unsigned long gc_upcall_timestamp; 85 unsigned long gc_upcall_timestamp;
86 unsigned char gc_machine_cred : 1; 86 unsigned char gc_machine_cred : 1;
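Note: marking gc_gss_ctx and gc_ctx as __rcu covers the reader side; the writer side pairs it with rcu_assign_pointer() and defers releasing the old context past a grace period. A sketch of the update pattern (the lock name and release helper are assumptions, not taken from auth_gss.c):

    struct gss_cl_ctx *old;

    old = rcu_dereference_protected(gss_cred->gc_ctx,
                                    lockdep_is_held(&ctx_lock));
    rcu_assign_pointer(gss_cred->gc_ctx, new_ctx);
    if (old) {
            synchronize_rcu();
            put_ctx(old);
    }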
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index a8cc4e13434c..c90696544176 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -23,12 +23,12 @@ struct restart_block {
23 }; 23 };
24 /* For futex_wait and futex_wait_requeue_pi */ 24 /* For futex_wait and futex_wait_requeue_pi */
25 struct { 25 struct {
26 u32 *uaddr; 26 u32 __user *uaddr;
27 u32 val; 27 u32 val;
28 u32 flags; 28 u32 flags;
29 u32 bitset; 29 u32 bitset;
30 u64 time; 30 u64 time;
31 u32 *uaddr2; 31 u32 __user *uaddr2;
32 } futex; 32 } futex;
33 /* For nanosleep */ 33 /* For nanosleep */
34 struct { 34 struct {
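Note: the futex restart fields hold user-space addresses, so annotating them u32 __user * lets sparse catch a direct dereference; accesses must go through the uaccess helpers, for example:

    u32 uval;

    if (get_user(uval, restart->futex.uaddr))
            return -EFAULT;

This also removes the casts futex.c previously needed when stashing and restoring uaddr (see the futex_wait()/futex_wait_restart() hunks later in this diff).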
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 64e084ff5e5c..b91a40e847d2 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -201,6 +201,12 @@ int arch_update_cpu_topology(void);
201 .balance_interval = 64, \ 201 .balance_interval = 64, \
202} 202}
203 203
204#ifdef CONFIG_SCHED_BOOK
205#ifndef SD_BOOK_INIT
206#error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!!
207#endif
208#endif /* CONFIG_SCHED_BOOK */
209
204#ifdef CONFIG_NUMA 210#ifdef CONFIG_NUMA
205#ifndef SD_NODE_INIT 211#ifndef SD_NODE_INIT
206#error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!! 212#error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
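Note: any architecture that turns on CONFIG_SCHED_BOOK must now supply SD_BOOK_INIT in its asm/topology.h, matching the SD_NODE_INIT convention checked just below. The s390 patches in this series are the only user; reusing the CPU-level initializer is one plausible minimal definition:

    /* arch/<arch>/include/asm/topology.h -- sketch */
    #define SD_BOOK_INIT	SD_CPU_INIT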
diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h
index ef6c24a529e1..a4dc5b027bd9 100644
--- a/include/net/cls_cgroup.h
+++ b/include/net/cls_cgroup.h
@@ -51,7 +51,8 @@ static inline u32 task_cls_classid(struct task_struct *p)
51 return 0; 51 return 0;
52 52
53 rcu_read_lock(); 53 rcu_read_lock();
54 id = rcu_dereference(net_cls_subsys_id); 54 id = rcu_dereference_index_check(net_cls_subsys_id,
55 rcu_read_lock_held());
55 if (id >= 0) 56 if (id >= 0)
56 classid = container_of(task_subsys_state(p, id), 57 classid = container_of(task_subsys_state(p, id),
57 struct cgroup_cls_state, css)->classid; 58 struct cgroup_cls_state, css)->classid;
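Note: rcu_dereference_index_check() is the variant for RCU-protected integers (array indexes) rather than pointers, so the __rcu/sparse machinery for pointers does not apply; the second argument states why the read is safe. A generic sketch with a hypothetical index:

    int idx;

    rcu_read_lock();
    idx = rcu_dereference_index_check(cur_slot, rcu_read_lock_held());
    if (idx >= 0)
            process(table[idx]);
    rcu_read_unlock();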
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index e624dae54fa4..caf17db87dbc 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -75,7 +75,7 @@ struct nf_conntrack_helper;
75/* nf_conn feature for connections that have a helper */ 75/* nf_conn feature for connections that have a helper */
76struct nf_conn_help { 76struct nf_conn_help {
77 /* Helper. if any */ 77 /* Helper. if any */
78 struct nf_conntrack_helper *helper; 78 struct nf_conntrack_helper __rcu *helper;
79 79
80 union nf_conntrack_help help; 80 union nf_conntrack_help help;
81 81
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 9208c92aeab5..f6334782a593 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -362,6 +362,35 @@ TRACE_EVENT(sched_stat_runtime,
362 (unsigned long long)__entry->vruntime) 362 (unsigned long long)__entry->vruntime)
363); 363);
364 364
365/*
366 * Tracepoint for showing priority inheritance modifying a tasks
367 * priority.
368 */
369TRACE_EVENT(sched_pi_setprio,
370
371 TP_PROTO(struct task_struct *tsk, int newprio),
372
373 TP_ARGS(tsk, newprio),
374
375 TP_STRUCT__entry(
376 __array( char, comm, TASK_COMM_LEN )
377 __field( pid_t, pid )
378 __field( int, oldprio )
379 __field( int, newprio )
380 ),
381
382 TP_fast_assign(
383 memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
384 __entry->pid = tsk->pid;
385 __entry->oldprio = tsk->prio;
386 __entry->newprio = newprio;
387 ),
388
389 TP_printk("comm=%s pid=%d oldprio=%d newprio=%d",
390 __entry->comm, __entry->pid,
391 __entry->oldprio, __entry->newprio)
392);
393
365#endif /* _TRACE_SCHED_H */ 394#endif /* _TRACE_SCHED_H */
366 395
367/* This part must be outside protection */ 396/* This part must be outside protection */
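Note: the new sched_pi_setprio tracepoint is meant to fire where priority inheritance rewrites a task's effective priority, i.e. from rt_mutex_setprio() in the scheduler (call-site sketch below; the actual hook-up lives in the sched patches of this series, not in this header):

    /* kernel/sched.c, rt_mutex_setprio() -- sketch */
    trace_sched_pi_setprio(p, prio);

Once wired up it can be enabled through the sched_pi_setprio entry under /sys/kernel/debug/tracing/events/sched/ like any other sched tracepoint.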
diff --git a/init/Kconfig b/init/Kconfig
index 1ef0b439908e..36890f0c8456 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -339,6 +339,8 @@ config AUDIT_TREE
339 depends on AUDITSYSCALL 339 depends on AUDITSYSCALL
340 select FSNOTIFY 340 select FSNOTIFY
341 341
342source "kernel/irq/Kconfig"
343
342menu "RCU Subsystem" 344menu "RCU Subsystem"
343 345
344choice 346choice
@@ -347,6 +349,7 @@ choice
347 349
348config TREE_RCU 350config TREE_RCU
349 bool "Tree-based hierarchical RCU" 351 bool "Tree-based hierarchical RCU"
352 depends on !PREEMPT && SMP
350 help 353 help
351 This option selects the RCU implementation that is 354 This option selects the RCU implementation that is
352 designed for very large SMP system with hundreds or 355 designed for very large SMP system with hundreds or
@@ -354,7 +357,7 @@ config TREE_RCU
354 smaller systems. 357 smaller systems.
355 358
356config TREE_PREEMPT_RCU 359config TREE_PREEMPT_RCU
357 bool "Preemptable tree-based hierarchical RCU" 360 bool "Preemptible tree-based hierarchical RCU"
358 depends on PREEMPT 361 depends on PREEMPT
359 help 362 help
360 This option selects the RCU implementation that is 363 This option selects the RCU implementation that is
@@ -372,8 +375,22 @@ config TINY_RCU
372 is not required. This option greatly reduces the 375 is not required. This option greatly reduces the
373 memory footprint of RCU. 376 memory footprint of RCU.
374 377
378config TINY_PREEMPT_RCU
379 bool "Preemptible UP-only small-memory-footprint RCU"
380 depends on !SMP && PREEMPT
381 help
382 This option selects the RCU implementation that is designed
383 for real-time UP systems. This option greatly reduces the
384 memory footprint of RCU.
385
375endchoice 386endchoice
376 387
388config PREEMPT_RCU
389 def_bool ( TREE_PREEMPT_RCU || TINY_PREEMPT_RCU )
390 help
391 This option enables preemptible-RCU code that is common between
392 the TREE_PREEMPT_RCU and TINY_PREEMPT_RCU implementations.
393
377config RCU_TRACE 394config RCU_TRACE
378 bool "Enable tracing for RCU" 395 bool "Enable tracing for RCU"
379 depends on TREE_RCU || TREE_PREEMPT_RCU 396 depends on TREE_RCU || TREE_PREEMPT_RCU
@@ -394,9 +411,12 @@ config RCU_FANOUT
394 help 411 help
395 This option controls the fanout of hierarchical implementations 412 This option controls the fanout of hierarchical implementations
396 of RCU, allowing RCU to work efficiently on machines with 413 of RCU, allowing RCU to work efficiently on machines with
397 large numbers of CPUs. This value must be at least the cube 414 large numbers of CPUs. This value must be at least the fourth
398 root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit 415 root of NR_CPUS, which allows NR_CPUS to be insanely large.
399 systems and up to 262,144 for 64-bit systems. 416 The default value of RCU_FANOUT should be used for production
417 systems, but if you are stress-testing the RCU implementation
418 itself, small RCU_FANOUT values allow you to test large-system
419 code paths on small(er) systems.
400 420
401 Select a specific number if testing RCU itself. 421 Select a specific number if testing RCU itself.
402 Take the default if unsure. 422 Take the default if unsure.
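Note: PREEMPT_RCU is now a derived option that is true for either preemptible implementation, so common code can key off a single symbol instead of testing TREE_PREEMPT_RCU (the sched.h hunk earlier in this diff does exactly that). Roughly the shape this takes:

    #ifdef CONFIG_PREEMPT_RCU
    /* code shared by TREE_PREEMPT_RCU and TINY_PREEMPT_RCU */
    static inline void rcu_copy_process(struct task_struct *p)
    {
            p->rcu_read_lock_nesting = 0;
            p->rcu_read_unlock_special = 0;
            INIT_LIST_HEAD(&p->rcu_node_entry);
    }
    #else
    static inline void rcu_copy_process(struct task_struct *p) { }
    #endif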
diff --git a/init/main.c b/init/main.c
index 94ab488039aa..9684c9670b48 100644
--- a/init/main.c
+++ b/init/main.c
@@ -556,7 +556,6 @@ asmlinkage void __init start_kernel(void)
556 556
557 local_irq_disable(); 557 local_irq_disable();
558 early_boot_irqs_off(); 558 early_boot_irqs_off();
559 early_init_irq_lock_class();
560 559
561/* 560/*
562 * Interrupts are still disabled. Do necessary setups, then 561 * Interrupts are still disabled. Do necessary setups, then
diff --git a/kernel/Makefile b/kernel/Makefile
index 4d9bf5f8531f..e2c9d52cfe9e 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -87,6 +87,7 @@ obj-$(CONFIG_TREE_RCU) += rcutree.o
87obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o 87obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
88obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o 88obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
89obj-$(CONFIG_TINY_RCU) += rcutiny.o 89obj-$(CONFIG_TINY_RCU) += rcutiny.o
90obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o
90obj-$(CONFIG_RELAY) += relay.o 91obj-$(CONFIG_RELAY) += relay.o
91obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 92obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
92obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 93obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c9483d8f6140..291ba3d04bea 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -138,7 +138,7 @@ struct css_id {
138 * is called after synchronize_rcu(). But for safe use, css_is_removed() 138 * is called after synchronize_rcu(). But for safe use, css_is_removed()
139 * css_tryget() should be used for avoiding race. 139 * css_tryget() should be used for avoiding race.
140 */ 140 */
141 struct cgroup_subsys_state *css; 141 struct cgroup_subsys_state __rcu *css;
142 /* 142 /*
143 * ID of this css. 143 * ID of this css.
144 */ 144 */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b23c0979bbe7..51b143e2a07a 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1397,7 +1397,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1397 if (tsk->flags & PF_THREAD_BOUND) 1397 if (tsk->flags & PF_THREAD_BOUND)
1398 return -EINVAL; 1398 return -EINVAL;
1399 1399
1400 ret = security_task_setscheduler(tsk, 0, NULL); 1400 ret = security_task_setscheduler(tsk);
1401 if (ret) 1401 if (ret)
1402 return ret; 1402 return ret;
1403 if (threadgroup) { 1403 if (threadgroup) {
@@ -1405,7 +1405,7 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1405 1405
1406 rcu_read_lock(); 1406 rcu_read_lock();
1407 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 1407 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1408 ret = security_task_setscheduler(c, 0, NULL); 1408 ret = security_task_setscheduler(c);
1409 if (ret) { 1409 if (ret) {
1410 rcu_read_unlock(); 1410 rcu_read_unlock();
1411 return ret; 1411 return ret;
diff --git a/kernel/futex.c b/kernel/futex.c
index 6a3a5fa1526d..a118bf160e0b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -91,6 +91,7 @@ struct futex_pi_state {
91 91
92/** 92/**
93 * struct futex_q - The hashed futex queue entry, one per waiting task 93 * struct futex_q - The hashed futex queue entry, one per waiting task
94 * @list: priority-sorted list of tasks waiting on this futex
94 * @task: the task waiting on the futex 95 * @task: the task waiting on the futex
95 * @lock_ptr: the hash bucket lock 96 * @lock_ptr: the hash bucket lock
96 * @key: the key the futex is hashed on 97 * @key: the key the futex is hashed on
@@ -104,7 +105,7 @@ struct futex_pi_state {
104 * 105 *
105 * A futex_q has a woken state, just like tasks have TASK_RUNNING. 106 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
106 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. 107 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
107 * The order of wakup is always to make the first condition true, then 108 * The order of wakeup is always to make the first condition true, then
108 * the second. 109 * the second.
109 * 110 *
110 * PI futexes are typically woken before they are removed from the hash list via 111 * PI futexes are typically woken before they are removed from the hash list via
@@ -295,7 +296,7 @@ void put_futex_key(int fshared, union futex_key *key)
295 * Slow path to fixup the fault we just took in the atomic write 296 * Slow path to fixup the fault we just took in the atomic write
296 * access to @uaddr. 297 * access to @uaddr.
297 * 298 *
298 * We have no generic implementation of a non destructive write to the 299 * We have no generic implementation of a non-destructive write to the
299 * user address. We know that we faulted in the atomic pagefault 300 * user address. We know that we faulted in the atomic pagefault
300 * disabled section so we can as well avoid the #PF overhead by 301 * disabled section so we can as well avoid the #PF overhead by
301 * calling get_user_pages() right away. 302 * calling get_user_pages() right away.
@@ -515,7 +516,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
515 */ 516 */
516 pi_state = this->pi_state; 517 pi_state = this->pi_state;
517 /* 518 /*
518 * Userspace might have messed up non PI and PI futexes 519 * Userspace might have messed up non-PI and PI futexes
519 */ 520 */
520 if (unlikely(!pi_state)) 521 if (unlikely(!pi_state))
521 return -EINVAL; 522 return -EINVAL;
@@ -736,8 +737,8 @@ static void wake_futex(struct futex_q *q)
736 737
737 /* 738 /*
738 * We set q->lock_ptr = NULL _before_ we wake up the task. If 739 * We set q->lock_ptr = NULL _before_ we wake up the task. If
739 * a non futex wake up happens on another CPU then the task 740 * a non-futex wake up happens on another CPU then the task
740 * might exit and p would dereference a non existing task 741 * might exit and p would dereference a non-existing task
741 * struct. Prevent this by holding a reference on p across the 742 * struct. Prevent this by holding a reference on p across the
742 * wake up. 743 * wake up.
743 */ 744 */
@@ -1131,11 +1132,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1131 1132
1132/** 1133/**
1133 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 1134 * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1134 * uaddr1: source futex user address 1135 * @uaddr1: source futex user address
1135 * uaddr2: target futex user address 1136 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
1136 * nr_wake: number of waiters to wake (must be 1 for requeue_pi) 1137 * @uaddr2: target futex user address
1137 * nr_requeue: number of waiters to requeue (0-INT_MAX) 1138 * @nr_wake: number of waiters to wake (must be 1 for requeue_pi)
1138 * requeue_pi: if we are attempting to requeue from a non-pi futex to a 1139 * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1140 * @cmpval: @uaddr1 expected value (or %NULL)
1141 * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1139 * pi futex (pi to pi requeue is not supported) 1142 * pi futex (pi to pi requeue is not supported)
1140 * 1143 *
1141 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire 1144 * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
@@ -1360,10 +1363,10 @@ out:
1360 1363
1361/* The key must be already stored in q->key. */ 1364/* The key must be already stored in q->key. */
1362static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) 1365static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1366 __acquires(&hb->lock)
1363{ 1367{
1364 struct futex_hash_bucket *hb; 1368 struct futex_hash_bucket *hb;
1365 1369
1366 get_futex_key_refs(&q->key);
1367 hb = hash_futex(&q->key); 1370 hb = hash_futex(&q->key);
1368 q->lock_ptr = &hb->lock; 1371 q->lock_ptr = &hb->lock;
1369 1372
@@ -1373,9 +1376,9 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1373 1376
1374static inline void 1377static inline void
1375queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) 1378queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1379 __releases(&hb->lock)
1376{ 1380{
1377 spin_unlock(&hb->lock); 1381 spin_unlock(&hb->lock);
1378 drop_futex_key_refs(&q->key);
1379} 1382}
1380 1383
1381/** 1384/**
@@ -1391,6 +1394,7 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1391 * an example). 1394 * an example).
1392 */ 1395 */
1393static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) 1396static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1397 __releases(&hb->lock)
1394{ 1398{
1395 int prio; 1399 int prio;
1396 1400
@@ -1471,6 +1475,7 @@ retry:
1471 * and dropped here. 1475 * and dropped here.
1472 */ 1476 */
1473static void unqueue_me_pi(struct futex_q *q) 1477static void unqueue_me_pi(struct futex_q *q)
1478 __releases(q->lock_ptr)
1474{ 1479{
1475 WARN_ON(plist_node_empty(&q->list)); 1480 WARN_ON(plist_node_empty(&q->list));
1476 plist_del(&q->list, &q->list.plist); 1481 plist_del(&q->list, &q->list.plist);
@@ -1480,8 +1485,6 @@ static void unqueue_me_pi(struct futex_q *q)
1480 q->pi_state = NULL; 1485 q->pi_state = NULL;
1481 1486
1482 spin_unlock(q->lock_ptr); 1487 spin_unlock(q->lock_ptr);
1483
1484 drop_futex_key_refs(&q->key);
1485} 1488}
1486 1489
1487/* 1490/*
@@ -1812,7 +1815,10 @@ static int futex_wait(u32 __user *uaddr, int fshared,
1812 } 1815 }
1813 1816
1814retry: 1817retry:
1815 /* Prepare to wait on uaddr. */ 1818 /*
1819 * Prepare to wait on uaddr. On success, holds hb lock and increments
1820 * q.key refs.
1821 */
1816 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 1822 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
1817 if (ret) 1823 if (ret)
1818 goto out; 1824 goto out;
@@ -1822,28 +1828,27 @@ retry:
1822 1828
1823 /* If we were woken (and unqueued), we succeeded, whatever. */ 1829 /* If we were woken (and unqueued), we succeeded, whatever. */
1824 ret = 0; 1830 ret = 0;
1831 /* unqueue_me() drops q.key ref */
1825 if (!unqueue_me(&q)) 1832 if (!unqueue_me(&q))
1826 goto out_put_key; 1833 goto out;
1827 ret = -ETIMEDOUT; 1834 ret = -ETIMEDOUT;
1828 if (to && !to->task) 1835 if (to && !to->task)
1829 goto out_put_key; 1836 goto out;
1830 1837
1831 /* 1838 /*
1832 * We expect signal_pending(current), but we might be the 1839 * We expect signal_pending(current), but we might be the
1833 * victim of a spurious wakeup as well. 1840 * victim of a spurious wakeup as well.
1834 */ 1841 */
1835 if (!signal_pending(current)) { 1842 if (!signal_pending(current))
1836 put_futex_key(fshared, &q.key);
1837 goto retry; 1843 goto retry;
1838 }
1839 1844
1840 ret = -ERESTARTSYS; 1845 ret = -ERESTARTSYS;
1841 if (!abs_time) 1846 if (!abs_time)
1842 goto out_put_key; 1847 goto out;
1843 1848
1844 restart = &current_thread_info()->restart_block; 1849 restart = &current_thread_info()->restart_block;
1845 restart->fn = futex_wait_restart; 1850 restart->fn = futex_wait_restart;
1846 restart->futex.uaddr = (u32 *)uaddr; 1851 restart->futex.uaddr = uaddr;
1847 restart->futex.val = val; 1852 restart->futex.val = val;
1848 restart->futex.time = abs_time->tv64; 1853 restart->futex.time = abs_time->tv64;
1849 restart->futex.bitset = bitset; 1854 restart->futex.bitset = bitset;
@@ -1856,8 +1861,6 @@ retry:
1856 1861
1857 ret = -ERESTART_RESTARTBLOCK; 1862 ret = -ERESTART_RESTARTBLOCK;
1858 1863
1859out_put_key:
1860 put_futex_key(fshared, &q.key);
1861out: 1864out:
1862 if (to) { 1865 if (to) {
1863 hrtimer_cancel(&to->timer); 1866 hrtimer_cancel(&to->timer);
@@ -1869,7 +1872,7 @@ out:
1869 1872
1870static long futex_wait_restart(struct restart_block *restart) 1873static long futex_wait_restart(struct restart_block *restart)
1871{ 1874{
1872 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; 1875 u32 __user *uaddr = restart->futex.uaddr;
1873 int fshared = 0; 1876 int fshared = 0;
1874 ktime_t t, *tp = NULL; 1877 ktime_t t, *tp = NULL;
1875 1878
@@ -2236,7 +2239,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2236 q.rt_waiter = &rt_waiter; 2239 q.rt_waiter = &rt_waiter;
2237 q.requeue_pi_key = &key2; 2240 q.requeue_pi_key = &key2;
2238 2241
2239 /* Prepare to wait on uaddr. */ 2242 /*
2243 * Prepare to wait on uaddr. On success, increments q.key (key1) ref
2244 * count.
2245 */
2240 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); 2246 ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
2241 if (ret) 2247 if (ret)
2242 goto out_key2; 2248 goto out_key2;
@@ -2254,7 +2260,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2254 * In order for us to be here, we know our q.key == key2, and since 2260 * In order for us to be here, we know our q.key == key2, and since
2255 * we took the hb->lock above, we also know that futex_requeue() has 2261 * we took the hb->lock above, we also know that futex_requeue() has
2256 * completed and we no longer have to concern ourselves with a wakeup 2262 * completed and we no longer have to concern ourselves with a wakeup
2257 * race with the atomic proxy lock acquition by the requeue code. 2263 * race with the atomic proxy lock acquisition by the requeue code. The
2264 * futex_requeue dropped our key1 reference and incremented our key2
2265 * reference count.
2258 */ 2266 */
2259 2267
2260 /* Check if the requeue code acquired the second futex for us. */ 2268 /* Check if the requeue code acquired the second futex for us. */
@@ -2458,7 +2466,7 @@ retry:
2458 */ 2466 */
2459static inline int fetch_robust_entry(struct robust_list __user **entry, 2467static inline int fetch_robust_entry(struct robust_list __user **entry,
2460 struct robust_list __user * __user *head, 2468 struct robust_list __user * __user *head,
2461 int *pi) 2469 unsigned int *pi)
2462{ 2470{
2463 unsigned long uentry; 2471 unsigned long uentry;
2464 2472
@@ -2647,7 +2655,7 @@ static int __init futex_init(void)
2647 * of the complex code paths. Also we want to prevent 2655 * of the complex code paths. Also we want to prevent
2648 * registration of robust lists in that case. NULL is 2656 * registration of robust lists in that case. NULL is
2649 * guaranteed to fault and we get -EFAULT on functional 2657 * guaranteed to fault and we get -EFAULT on functional
2650 * implementation, the non functional ones will return 2658 * implementation, the non-functional ones will return
2651 * -ENOSYS. 2659 * -ENOSYS.
2652 */ 2660 */
2653 curval = cmpxchg_futex_value_locked(NULL, 0, 0); 2661 curval = cmpxchg_futex_value_locked(NULL, 0, 0);
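Note: besides the reference-counting rework described in the comments above (futex_wait_setup() now takes the q.key reference and unqueue_me() drops it, so the out_put_key exit path disappears), these hunks add sparse context annotations, which let sparse check that lock acquire/release pairs balance across function boundaries. Simplified shape of an annotated helper (the real queue_lock() also records q->lock_ptr):

    static struct futex_hash_bucket *queue_lock(struct futex_q *q)
            __acquires(&hb->lock)
    {
            struct futex_hash_bucket *hb = hash_futex(&q->key);

            spin_lock(&hb->lock);
            return hb;
    }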
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d49afb2395e5..06da4dfc339b 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -19,7 +19,7 @@
19 */ 19 */
20static inline int 20static inline int
21fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, 21fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
22 compat_uptr_t __user *head, int *pi) 22 compat_uptr_t __user *head, unsigned int *pi)
23{ 23{
24 if (get_user(*uentry, head)) 24 if (get_user(*uentry, head))
25 return -EFAULT; 25 return -EFAULT;
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 0c642d51aac2..53ead174da2f 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -98,7 +98,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
98 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" 98 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
99 " disables this message.\n"); 99 " disables this message.\n");
100 sched_show_task(t); 100 sched_show_task(t);
101 __debug_show_held_locks(t); 101 debug_show_held_locks(t);
102 102
103 touch_nmi_watchdog(); 103 touch_nmi_watchdog();
104 104
@@ -111,7 +111,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
111 * periodically exit the critical section and enter a new one. 111 * periodically exit the critical section and enter a new one.
112 * 112 *
113 * For preemptible RCU it is sufficient to call rcu_read_unlock in order 113 * For preemptible RCU it is sufficient to call rcu_read_unlock in order
114 * exit the grace period. For classic RCU, a reschedule is required. 114 * to exit the grace period. For classic RCU, a reschedule is required.
115 */ 115 */
116static void rcu_lock_break(struct task_struct *g, struct task_struct *t) 116static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
117{ 117{
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
new file mode 100644
index 000000000000..31d766bf5d2e
--- /dev/null
+++ b/kernel/irq/Kconfig
@@ -0,0 +1,53 @@
1config HAVE_GENERIC_HARDIRQS
2 def_bool n
3
4if HAVE_GENERIC_HARDIRQS
5menu "IRQ subsystem"
6#
7# Interrupt subsystem related configuration options
8#
9config GENERIC_HARDIRQS
10 def_bool y
11
12config GENERIC_HARDIRQS_NO__DO_IRQ
13 def_bool y
14
15# Select this to disable the deprecated stuff
16config GENERIC_HARDIRQS_NO_DEPRECATED
17 def_bool n
18
19# Options selectable by the architecture code
20config HAVE_SPARSE_IRQ
21 def_bool n
22
23config GENERIC_IRQ_PROBE
24 def_bool n
25
26config GENERIC_PENDING_IRQ
27 def_bool n
28
29config AUTO_IRQ_AFFINITY
30 def_bool n
31
32config IRQ_PER_CPU
33 def_bool n
34
35config HARDIRQS_SW_RESEND
36 def_bool n
37
38config SPARSE_IRQ
39 bool "Support sparse irq numbering"
40 depends on HAVE_SPARSE_IRQ
41 ---help---
42
43 Sparse irq numbering is useful for distro kernels that want
44 to define a high CONFIG_NR_CPUS value but still want to have
45 low kernel memory footprint on smaller machines.
46
47 ( Sparse irqs can also be beneficial on NUMA boxes, as they spread
48 out the interrupt descriptors in a more NUMA-friendly way. )
49
50 If you don't know what to do here, say N.
51
52endmenu
53endif
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 7d047808419d..54329cd7b3ee 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,7 +1,6 @@
1 1
2obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o 2obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
6obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o
7obj-$(CONFIG_PM_SLEEP) += pm.o 6obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 2295a31ef110..505798f86c36 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -57,9 +57,10 @@ unsigned long probe_irq_on(void)
57 * Some chips need to know about probing in 57 * Some chips need to know about probing in
58 * progress: 58 * progress:
59 */ 59 */
60 if (desc->chip->set_type) 60 if (desc->irq_data.chip->irq_set_type)
61 desc->chip->set_type(i, IRQ_TYPE_PROBE); 61 desc->irq_data.chip->irq_set_type(&desc->irq_data,
62 desc->chip->startup(i); 62 IRQ_TYPE_PROBE);
63 desc->irq_data.chip->irq_startup(&desc->irq_data);
63 } 64 }
64 raw_spin_unlock_irq(&desc->lock); 65 raw_spin_unlock_irq(&desc->lock);
65 } 66 }
@@ -76,7 +77,7 @@ unsigned long probe_irq_on(void)
76 raw_spin_lock_irq(&desc->lock); 77 raw_spin_lock_irq(&desc->lock);
77 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 78 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
78 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 79 desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
79 if (desc->chip->startup(i)) 80 if (desc->irq_data.chip->irq_startup(&desc->irq_data))
80 desc->status |= IRQ_PENDING; 81 desc->status |= IRQ_PENDING;
81 } 82 }
82 raw_spin_unlock_irq(&desc->lock); 83 raw_spin_unlock_irq(&desc->lock);
@@ -98,7 +99,7 @@ unsigned long probe_irq_on(void)
98 /* It triggered already - consider it spurious. */ 99 /* It triggered already - consider it spurious. */
99 if (!(status & IRQ_WAITING)) { 100 if (!(status & IRQ_WAITING)) {
100 desc->status = status & ~IRQ_AUTODETECT; 101 desc->status = status & ~IRQ_AUTODETECT;
101 desc->chip->shutdown(i); 102 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
102 } else 103 } else
103 if (i < 32) 104 if (i < 32)
104 mask |= 1 << i; 105 mask |= 1 << i;
@@ -137,7 +138,7 @@ unsigned int probe_irq_mask(unsigned long val)
137 mask |= 1 << i; 138 mask |= 1 << i;
138 139
139 desc->status = status & ~IRQ_AUTODETECT; 140 desc->status = status & ~IRQ_AUTODETECT;
140 desc->chip->shutdown(i); 141 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
141 } 142 }
142 raw_spin_unlock_irq(&desc->lock); 143 raw_spin_unlock_irq(&desc->lock);
143 } 144 }
@@ -181,7 +182,7 @@ int probe_irq_off(unsigned long val)
181 nr_of_irqs++; 182 nr_of_irqs++;
182 } 183 }
183 desc->status = status & ~IRQ_AUTODETECT; 184 desc->status = status & ~IRQ_AUTODETECT;
184 desc->chip->shutdown(i); 185 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
185 } 186 }
186 raw_spin_unlock_irq(&desc->lock); 187 raw_spin_unlock_irq(&desc->lock);
187 } 188 }
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index b7091d5ca2f8..baa5c4acad83 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -18,108 +18,6 @@
18 18
19#include "internals.h" 19#include "internals.h"
20 20
21static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data)
22{
23 struct irq_desc *desc;
24 unsigned long flags;
25
26 desc = irq_to_desc(irq);
27 if (!desc) {
28 WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
29 return;
30 }
31
32 /* Ensure we don't have left over values from a previous use of this irq */
33 raw_spin_lock_irqsave(&desc->lock, flags);
34 desc->status = IRQ_DISABLED;
35 desc->chip = &no_irq_chip;
36 desc->handle_irq = handle_bad_irq;
37 desc->depth = 1;
38 desc->msi_desc = NULL;
39 desc->handler_data = NULL;
40 if (!keep_chip_data)
41 desc->chip_data = NULL;
42 desc->action = NULL;
43 desc->irq_count = 0;
44 desc->irqs_unhandled = 0;
45#ifdef CONFIG_SMP
46 cpumask_setall(desc->affinity);
47#ifdef CONFIG_GENERIC_PENDING_IRQ
48 cpumask_clear(desc->pending_mask);
49#endif
50#endif
51 raw_spin_unlock_irqrestore(&desc->lock, flags);
52}
53
54/**
55 * dynamic_irq_init - initialize a dynamically allocated irq
56 * @irq: irq number to initialize
57 */
58void dynamic_irq_init(unsigned int irq)
59{
60 dynamic_irq_init_x(irq, false);
61}
62
63/**
64 * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq
65 * @irq: irq number to initialize
66 *
67 * does not set irq_to_desc(irq)->chip_data to NULL
68 */
69void dynamic_irq_init_keep_chip_data(unsigned int irq)
70{
71 dynamic_irq_init_x(irq, true);
72}
73
74static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data)
75{
76 struct irq_desc *desc = irq_to_desc(irq);
77 unsigned long flags;
78
79 if (!desc) {
80 WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
81 return;
82 }
83
84 raw_spin_lock_irqsave(&desc->lock, flags);
85 if (desc->action) {
86 raw_spin_unlock_irqrestore(&desc->lock, flags);
87 WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n",
88 irq);
89 return;
90 }
91 desc->msi_desc = NULL;
92 desc->handler_data = NULL;
93 if (!keep_chip_data)
94 desc->chip_data = NULL;
95 desc->handle_irq = handle_bad_irq;
96 desc->chip = &no_irq_chip;
97 desc->name = NULL;
98 clear_kstat_irqs(desc);
99 raw_spin_unlock_irqrestore(&desc->lock, flags);
100}
101
102/**
103 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
104 * @irq: irq number to initialize
105 */
106void dynamic_irq_cleanup(unsigned int irq)
107{
108 dynamic_irq_cleanup_x(irq, false);
109}
110
111/**
112 * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq
113 * @irq: irq number to initialize
114 *
115 * does not set irq_to_desc(irq)->chip_data to NULL
116 */
117void dynamic_irq_cleanup_keep_chip_data(unsigned int irq)
118{
119 dynamic_irq_cleanup_x(irq, true);
120}
121
122
123/** 21/**
124 * set_irq_chip - set the irq chip for an irq 22 * set_irq_chip - set the irq chip for an irq
125 * @irq: irq number 23 * @irq: irq number
@@ -140,7 +38,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
140 38
141 raw_spin_lock_irqsave(&desc->lock, flags); 39 raw_spin_lock_irqsave(&desc->lock, flags);
142 irq_chip_set_defaults(chip); 40 irq_chip_set_defaults(chip);
143 desc->chip = chip; 41 desc->irq_data.chip = chip;
144 raw_spin_unlock_irqrestore(&desc->lock, flags); 42 raw_spin_unlock_irqrestore(&desc->lock, flags);
145 43
146 return 0; 44 return 0;
@@ -193,7 +91,7 @@ int set_irq_data(unsigned int irq, void *data)
193 } 91 }
194 92
195 raw_spin_lock_irqsave(&desc->lock, flags); 93 raw_spin_lock_irqsave(&desc->lock, flags);
196 desc->handler_data = data; 94 desc->irq_data.handler_data = data;
197 raw_spin_unlock_irqrestore(&desc->lock, flags); 95 raw_spin_unlock_irqrestore(&desc->lock, flags);
198 return 0; 96 return 0;
199} 97}
@@ -218,7 +116,7 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry)
218 } 116 }
219 117
220 raw_spin_lock_irqsave(&desc->lock, flags); 118 raw_spin_lock_irqsave(&desc->lock, flags);
221 desc->msi_desc = entry; 119 desc->irq_data.msi_desc = entry;
222 if (entry) 120 if (entry)
223 entry->irq = irq; 121 entry->irq = irq;
224 raw_spin_unlock_irqrestore(&desc->lock, flags); 122 raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -243,19 +141,27 @@ int set_irq_chip_data(unsigned int irq, void *data)
243 return -EINVAL; 141 return -EINVAL;
244 } 142 }
245 143
246 if (!desc->chip) { 144 if (!desc->irq_data.chip) {
247 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); 145 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
248 return -EINVAL; 146 return -EINVAL;
249 } 147 }
250 148
251 raw_spin_lock_irqsave(&desc->lock, flags); 149 raw_spin_lock_irqsave(&desc->lock, flags);
252 desc->chip_data = data; 150 desc->irq_data.chip_data = data;
253 raw_spin_unlock_irqrestore(&desc->lock, flags); 151 raw_spin_unlock_irqrestore(&desc->lock, flags);
254 152
255 return 0; 153 return 0;
256} 154}
257EXPORT_SYMBOL(set_irq_chip_data); 155EXPORT_SYMBOL(set_irq_chip_data);
258 156
157struct irq_data *irq_get_irq_data(unsigned int irq)
158{
159 struct irq_desc *desc = irq_to_desc(irq);
160
161 return desc ? &desc->irq_data : NULL;
162}
163EXPORT_SYMBOL_GPL(irq_get_irq_data);
164
259/** 165/**
260 * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq 166 * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
261 * 167 *
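Note: irq_get_irq_data() gives chip and architecture code a handle on the new per-interrupt irq_data without reaching into struct irq_desc. A sketch of the kind of access it enables (the field use is illustrative):

    struct irq_data *d = irq_get_irq_data(irq);
    void *priv = d ? d->chip_data : NULL;   /* as set by set_irq_chip_data() */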
@@ -287,93 +193,216 @@ EXPORT_SYMBOL_GPL(set_irq_nested_thread);
287/* 193/*
288 * default enable function 194 * default enable function
289 */ 195 */
290static void default_enable(unsigned int irq) 196static void default_enable(struct irq_data *data)
291{ 197{
292 struct irq_desc *desc = irq_to_desc(irq); 198 struct irq_desc *desc = irq_data_to_desc(data);
293 199
294 desc->chip->unmask(irq); 200 desc->irq_data.chip->irq_unmask(&desc->irq_data);
295 desc->status &= ~IRQ_MASKED; 201 desc->status &= ~IRQ_MASKED;
296} 202}
297 203
298/* 204/*
299 * default disable function 205 * default disable function
300 */ 206 */
301static void default_disable(unsigned int irq) 207static void default_disable(struct irq_data *data)
302{ 208{
303} 209}
304 210
305/* 211/*
306 * default startup function 212 * default startup function
307 */ 213 */
308static unsigned int default_startup(unsigned int irq) 214static unsigned int default_startup(struct irq_data *data)
309{ 215{
310 struct irq_desc *desc = irq_to_desc(irq); 216 struct irq_desc *desc = irq_data_to_desc(data);
311 217
312 desc->chip->enable(irq); 218 desc->irq_data.chip->irq_enable(data);
313 return 0; 219 return 0;
314} 220}
315 221
316/* 222/*
317 * default shutdown function 223 * default shutdown function
318 */ 224 */
319static void default_shutdown(unsigned int irq) 225static void default_shutdown(struct irq_data *data)
320{ 226{
321 struct irq_desc *desc = irq_to_desc(irq); 227 struct irq_desc *desc = irq_data_to_desc(data);
322 228
323 desc->chip->mask(irq); 229 desc->irq_data.chip->irq_mask(&desc->irq_data);
324 desc->status |= IRQ_MASKED; 230 desc->status |= IRQ_MASKED;
325} 231}
326 232
233#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
234/* Temporary migration helpers */
235static void compat_irq_mask(struct irq_data *data)
236{
237 data->chip->mask(data->irq);
238}
239
240static void compat_irq_unmask(struct irq_data *data)
241{
242 data->chip->unmask(data->irq);
243}
244
245static void compat_irq_ack(struct irq_data *data)
246{
247 data->chip->ack(data->irq);
248}
249
250static void compat_irq_mask_ack(struct irq_data *data)
251{
252 data->chip->mask_ack(data->irq);
253}
254
255static void compat_irq_eoi(struct irq_data *data)
256{
257 data->chip->eoi(data->irq);
258}
259
260static void compat_irq_enable(struct irq_data *data)
261{
262 data->chip->enable(data->irq);
263}
264
265static void compat_irq_disable(struct irq_data *data)
266{
267 data->chip->disable(data->irq);
268}
269
270static void compat_irq_shutdown(struct irq_data *data)
271{
272 data->chip->shutdown(data->irq);
273}
274
275static unsigned int compat_irq_startup(struct irq_data *data)
276{
277 return data->chip->startup(data->irq);
278}
279
280static int compat_irq_set_affinity(struct irq_data *data,
281 const struct cpumask *dest, bool force)
282{
283 return data->chip->set_affinity(data->irq, dest);
284}
285
286static int compat_irq_set_type(struct irq_data *data, unsigned int type)
287{
288 return data->chip->set_type(data->irq, type);
289}
290
291static int compat_irq_set_wake(struct irq_data *data, unsigned int on)
292{
293 return data->chip->set_wake(data->irq, on);
294}
295
296static int compat_irq_retrigger(struct irq_data *data)
297{
298 return data->chip->retrigger(data->irq);
299}
300
301static void compat_bus_lock(struct irq_data *data)
302{
303 data->chip->bus_lock(data->irq);
304}
305
306static void compat_bus_sync_unlock(struct irq_data *data)
307{
308 data->chip->bus_sync_unlock(data->irq);
309}
310#endif
311
327/* 312/*
328 * Fixup enable/disable function pointers 313 * Fixup enable/disable function pointers
329 */ 314 */
330void irq_chip_set_defaults(struct irq_chip *chip) 315void irq_chip_set_defaults(struct irq_chip *chip)
331{ 316{
332 if (!chip->enable) 317#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
333 chip->enable = default_enable;
334 if (!chip->disable)
335 chip->disable = default_disable;
336 if (!chip->startup)
337 chip->startup = default_startup;
338 /* 318 /*
339 * We use chip->disable, when the user provided its own. When 319 * Compat fixup functions need to be before we set the
340 * we have default_disable set for chip->disable, then we need 320 * defaults for enable/disable/startup/shutdown
321 */
322 if (chip->enable)
323 chip->irq_enable = compat_irq_enable;
324 if (chip->disable)
325 chip->irq_disable = compat_irq_disable;
326 if (chip->shutdown)
327 chip->irq_shutdown = compat_irq_shutdown;
328 if (chip->startup)
329 chip->irq_startup = compat_irq_startup;
330#endif
331 /*
332 * The real defaults
333 */
334 if (!chip->irq_enable)
335 chip->irq_enable = default_enable;
336 if (!chip->irq_disable)
337 chip->irq_disable = default_disable;
338 if (!chip->irq_startup)
339 chip->irq_startup = default_startup;
340 /*
341 * We use chip->irq_disable, when the user provided its own. When
342 * we have default_disable set for chip->irq_disable, then we need
341 * to use default_shutdown, otherwise the irq line is not 343 * to use default_shutdown, otherwise the irq line is not
342 * disabled on free_irq(): 344 * disabled on free_irq():
343 */ 345 */
344 if (!chip->shutdown) 346 if (!chip->irq_shutdown)
345 chip->shutdown = chip->disable != default_disable ? 347 chip->irq_shutdown = chip->irq_disable != default_disable ?
346 chip->disable : default_shutdown; 348 chip->irq_disable : default_shutdown;
347 if (!chip->name) 349
348 chip->name = chip->typename; 350#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
349 if (!chip->end) 351 if (!chip->end)
350 chip->end = dummy_irq_chip.end; 352 chip->end = dummy_irq_chip.end;
353
354 /*
355 * Now fix up the remaining compat handlers
356 */
357 if (chip->bus_lock)
358 chip->irq_bus_lock = compat_bus_lock;
359 if (chip->bus_sync_unlock)
360 chip->irq_bus_sync_unlock = compat_bus_sync_unlock;
361 if (chip->mask)
362 chip->irq_mask = compat_irq_mask;
363 if (chip->unmask)
364 chip->irq_unmask = compat_irq_unmask;
365 if (chip->ack)
366 chip->irq_ack = compat_irq_ack;
367 if (chip->mask_ack)
368 chip->irq_mask_ack = compat_irq_mask_ack;
369 if (chip->eoi)
370 chip->irq_eoi = compat_irq_eoi;
371 if (chip->set_affinity)
372 chip->irq_set_affinity = compat_irq_set_affinity;
373 if (chip->set_type)
374 chip->irq_set_type = compat_irq_set_type;
375 if (chip->set_wake)
376 chip->irq_set_wake = compat_irq_set_wake;
377 if (chip->retrigger)
378 chip->irq_retrigger = compat_irq_retrigger;
379#endif
351} 380}
352 381
353static inline void mask_ack_irq(struct irq_desc *desc, int irq) 382static inline void mask_ack_irq(struct irq_desc *desc)
354{ 383{
355 if (desc->chip->mask_ack) 384 if (desc->irq_data.chip->irq_mask_ack)
356 desc->chip->mask_ack(irq); 385 desc->irq_data.chip->irq_mask_ack(&desc->irq_data);
357 else { 386 else {
358 desc->chip->mask(irq); 387 desc->irq_data.chip->irq_mask(&desc->irq_data);
359 if (desc->chip->ack) 388 if (desc->irq_data.chip->irq_ack)
360 desc->chip->ack(irq); 389 desc->irq_data.chip->irq_ack(&desc->irq_data);
361 } 390 }
362 desc->status |= IRQ_MASKED; 391 desc->status |= IRQ_MASKED;
363} 392}
364 393
365static inline void mask_irq(struct irq_desc *desc, int irq) 394static inline void mask_irq(struct irq_desc *desc)
366{ 395{
367 if (desc->chip->mask) { 396 if (desc->irq_data.chip->irq_mask) {
368 desc->chip->mask(irq); 397 desc->irq_data.chip->irq_mask(&desc->irq_data);
369 desc->status |= IRQ_MASKED; 398 desc->status |= IRQ_MASKED;
370 } 399 }
371} 400}
372 401
373static inline void unmask_irq(struct irq_desc *desc, int irq) 402static inline void unmask_irq(struct irq_desc *desc)
374{ 403{
375 if (desc->chip->unmask) { 404 if (desc->irq_data.chip->irq_unmask) {
376 desc->chip->unmask(irq); 405 desc->irq_data.chip->irq_unmask(&desc->irq_data);
377 desc->status &= ~IRQ_MASKED; 406 desc->status &= ~IRQ_MASKED;
378 } 407 }
379} 408}
@@ -476,7 +505,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
476 irqreturn_t action_ret; 505 irqreturn_t action_ret;
477 506
478 raw_spin_lock(&desc->lock); 507 raw_spin_lock(&desc->lock);
479 mask_ack_irq(desc, irq); 508 mask_ack_irq(desc);
480 509
481 if (unlikely(desc->status & IRQ_INPROGRESS)) 510 if (unlikely(desc->status & IRQ_INPROGRESS))
482 goto out_unlock; 511 goto out_unlock;
@@ -502,7 +531,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
502 desc->status &= ~IRQ_INPROGRESS; 531 desc->status &= ~IRQ_INPROGRESS;
503 532
504 if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) 533 if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
505 unmask_irq(desc, irq); 534 unmask_irq(desc);
506out_unlock: 535out_unlock:
507 raw_spin_unlock(&desc->lock); 536 raw_spin_unlock(&desc->lock);
508} 537}
@@ -539,7 +568,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
539 action = desc->action; 568 action = desc->action;
540 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 569 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
541 desc->status |= IRQ_PENDING; 570 desc->status |= IRQ_PENDING;
542 mask_irq(desc, irq); 571 mask_irq(desc);
543 goto out; 572 goto out;
544 } 573 }
545 574
@@ -554,7 +583,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
554 raw_spin_lock(&desc->lock); 583 raw_spin_lock(&desc->lock);
555 desc->status &= ~IRQ_INPROGRESS; 584 desc->status &= ~IRQ_INPROGRESS;
556out: 585out:
557 desc->chip->eoi(irq); 586 desc->irq_data.chip->irq_eoi(&desc->irq_data);
558 587
559 raw_spin_unlock(&desc->lock); 588 raw_spin_unlock(&desc->lock);
560} 589}
@@ -590,14 +619,13 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
590 if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) || 619 if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
591 !desc->action)) { 620 !desc->action)) {
592 desc->status |= (IRQ_PENDING | IRQ_MASKED); 621 desc->status |= (IRQ_PENDING | IRQ_MASKED);
593 mask_ack_irq(desc, irq); 622 mask_ack_irq(desc);
594 goto out_unlock; 623 goto out_unlock;
595 } 624 }
596 kstat_incr_irqs_this_cpu(irq, desc); 625 kstat_incr_irqs_this_cpu(irq, desc);
597 626
598 /* Start handling the irq */ 627 /* Start handling the irq */
599 if (desc->chip->ack) 628 desc->irq_data.chip->irq_ack(&desc->irq_data);
600 desc->chip->ack(irq);
601 629
602 /* Mark the IRQ currently in progress.*/ 630 /* Mark the IRQ currently in progress.*/
603 desc->status |= IRQ_INPROGRESS; 631 desc->status |= IRQ_INPROGRESS;
@@ -607,7 +635,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
607 irqreturn_t action_ret; 635 irqreturn_t action_ret;
608 636
609 if (unlikely(!action)) { 637 if (unlikely(!action)) {
610 mask_irq(desc, irq); 638 mask_irq(desc);
611 goto out_unlock; 639 goto out_unlock;
612 } 640 }
613 641
@@ -619,7 +647,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
619 if (unlikely((desc->status & 647 if (unlikely((desc->status &
620 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == 648 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
621 (IRQ_PENDING | IRQ_MASKED))) { 649 (IRQ_PENDING | IRQ_MASKED))) {
622 unmask_irq(desc, irq); 650 unmask_irq(desc);
623 } 651 }
624 652
625 desc->status &= ~IRQ_PENDING; 653 desc->status &= ~IRQ_PENDING;
@@ -650,15 +678,15 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
650 678
651 kstat_incr_irqs_this_cpu(irq, desc); 679 kstat_incr_irqs_this_cpu(irq, desc);
652 680
653 if (desc->chip->ack) 681 if (desc->irq_data.chip->irq_ack)
654 desc->chip->ack(irq); 682 desc->irq_data.chip->irq_ack(&desc->irq_data);
655 683
656 action_ret = handle_IRQ_event(irq, desc->action); 684 action_ret = handle_IRQ_event(irq, desc->action);
657 if (!noirqdebug) 685 if (!noirqdebug)
658 note_interrupt(irq, desc, action_ret); 686 note_interrupt(irq, desc, action_ret);
659 687
660 if (desc->chip->eoi) 688 if (desc->irq_data.chip->irq_eoi)
661 desc->chip->eoi(irq); 689 desc->irq_data.chip->irq_eoi(&desc->irq_data);
662} 690}
663 691
664void 692void
@@ -676,7 +704,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
676 704
677 if (!handle) 705 if (!handle)
678 handle = handle_bad_irq; 706 handle = handle_bad_irq;
679 else if (desc->chip == &no_irq_chip) { 707 else if (desc->irq_data.chip == &no_irq_chip) {
680 printk(KERN_WARNING "Trying to install %sinterrupt handler " 708 printk(KERN_WARNING "Trying to install %sinterrupt handler "
681 "for IRQ%d\n", is_chained ? "chained " : "", irq); 709 "for IRQ%d\n", is_chained ? "chained " : "", irq);
682 /* 710 /*
@@ -686,16 +714,16 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
686 * prevent us to setup the interrupt at all. Switch it to 714 * prevent us to setup the interrupt at all. Switch it to
687 * dummy_irq_chip for easy transition. 715 * dummy_irq_chip for easy transition.
688 */ 716 */
689 desc->chip = &dummy_irq_chip; 717 desc->irq_data.chip = &dummy_irq_chip;
690 } 718 }
691 719
692 chip_bus_lock(irq, desc); 720 chip_bus_lock(desc);
693 raw_spin_lock_irqsave(&desc->lock, flags); 721 raw_spin_lock_irqsave(&desc->lock, flags);
694 722
695 /* Uninstall? */ 723 /* Uninstall? */
696 if (handle == handle_bad_irq) { 724 if (handle == handle_bad_irq) {
697 if (desc->chip != &no_irq_chip) 725 if (desc->irq_data.chip != &no_irq_chip)
698 mask_ack_irq(desc, irq); 726 mask_ack_irq(desc);
699 desc->status |= IRQ_DISABLED; 727 desc->status |= IRQ_DISABLED;
700 desc->depth = 1; 728 desc->depth = 1;
701 } 729 }
@@ -706,10 +734,10 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
706 desc->status &= ~IRQ_DISABLED; 734 desc->status &= ~IRQ_DISABLED;
707 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; 735 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
708 desc->depth = 0; 736 desc->depth = 0;
709 desc->chip->startup(irq); 737 desc->irq_data.chip->irq_startup(&desc->irq_data);
710 } 738 }
711 raw_spin_unlock_irqrestore(&desc->lock, flags); 739 raw_spin_unlock_irqrestore(&desc->lock, flags);
712 chip_bus_sync_unlock(irq, desc); 740 chip_bus_sync_unlock(desc);
713} 741}
714EXPORT_SYMBOL_GPL(__set_irq_handler); 742EXPORT_SYMBOL_GPL(__set_irq_handler);
715 743
@@ -729,32 +757,20 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
729 __set_irq_handler(irq, handle, 0, name); 757 __set_irq_handler(irq, handle, 0, name);
730} 758}
731 759
732void set_irq_noprobe(unsigned int irq) 760void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
733{ 761{
734 struct irq_desc *desc = irq_to_desc(irq); 762 struct irq_desc *desc = irq_to_desc(irq);
735 unsigned long flags; 763 unsigned long flags;
736 764
737 if (!desc) { 765 if (!desc)
738 printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq);
739 return; 766 return;
740 }
741
742 raw_spin_lock_irqsave(&desc->lock, flags);
743 desc->status |= IRQ_NOPROBE;
744 raw_spin_unlock_irqrestore(&desc->lock, flags);
745}
746
747void set_irq_probe(unsigned int irq)
748{
749 struct irq_desc *desc = irq_to_desc(irq);
750 unsigned long flags;
751 767
752 if (!desc) { 768 /* Sanitize flags */
753 printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); 769 set &= IRQF_MODIFY_MASK;
754 return; 770 clr &= IRQF_MODIFY_MASK;
755 }
756 771
757 raw_spin_lock_irqsave(&desc->lock, flags); 772 raw_spin_lock_irqsave(&desc->lock, flags);
758 desc->status &= ~IRQ_NOPROBE; 773 desc->status &= ~clr;
774 desc->status |= set;
759 raw_spin_unlock_irqrestore(&desc->lock, flags); 775 raw_spin_unlock_irqrestore(&desc->lock, flags);
760} 776}
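
The chip.c hunks above migrate the irq_chip callbacks from the old "unsigned int irq" signatures to new irq_* methods that take a struct irq_data pointer, with compat_* wrappers installed for chips that still provide the old ones. As a minimal sketch (not part of the patch), a driver chip written directly against the new callbacks might look like the following; struct foo_chip, foo_writel() and the FOO_*_REG offsets are assumed names for illustration only:

    /*
     * Sketch only: a fictional controller using the irq_data-based
     * callbacks introduced by this series. foo_writel(), FOO_MASK_REG,
     * FOO_UNMASK_REG and struct foo_chip are illustrative, not real
     * kernel symbols.
     */
    static void foo_irq_mask(struct irq_data *d)
    {
            struct foo_chip *fc = d->chip_data;     /* driver-private data */

            foo_writel(fc, FOO_MASK_REG, BIT(d->irq - fc->irq_base));
    }

    static void foo_irq_unmask(struct irq_data *d)
    {
            struct foo_chip *fc = d->chip_data;

            foo_writel(fc, FOO_UNMASK_REG, BIT(d->irq - fc->irq_base));
    }

    static struct irq_chip foo_irq_chip = {
            .name           = "foo",
            .irq_mask       = foo_irq_mask,
            .irq_unmask     = foo_irq_unmask,
    };

Chips that still only fill in ->mask/->unmask keep working for now, because irq_chip_set_defaults() redirects the new methods to the compat_irq_* wrappers shown in the hunks above.
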
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
new file mode 100644
index 000000000000..20dc5474947e
--- /dev/null
+++ b/kernel/irq/dummychip.c
@@ -0,0 +1,68 @@
1/*
2 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
3 * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
4 *
5 * This file contains the dummy interrupt chip implementation
6 */
7#include <linux/interrupt.h>
8#include <linux/irq.h>
9
10#include "internals.h"
11
12/*
13 * What should we do if we get a hw irq event on an illegal vector?
 14 * Each architecture has to answer this itself.
15 */
16static void ack_bad(struct irq_data *data)
17{
18 struct irq_desc *desc = irq_data_to_desc(data);
19
20 print_irq_desc(data->irq, desc);
21 ack_bad_irq(data->irq);
22}
23
24/*
25 * NOP functions
26 */
27static void noop(struct irq_data *data) { }
28
29static unsigned int noop_ret(struct irq_data *data)
30{
31 return 0;
32}
33
34#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
35static void compat_noop(unsigned int irq) { }
36#define END_INIT .end = compat_noop
37#else
38#define END_INIT
39#endif
40
41/*
42 * Generic no controller implementation
43 */
44struct irq_chip no_irq_chip = {
45 .name = "none",
46 .irq_startup = noop_ret,
47 .irq_shutdown = noop,
48 .irq_enable = noop,
49 .irq_disable = noop,
50 .irq_ack = ack_bad,
51 END_INIT
52};
53
54/*
55 * Generic dummy implementation which can be used for
56 * real dumb interrupt sources
57 */
58struct irq_chip dummy_irq_chip = {
59 .name = "dummy",
60 .irq_startup = noop_ret,
61 .irq_shutdown = noop,
62 .irq_enable = noop,
63 .irq_disable = noop,
64 .irq_ack = noop,
65 .irq_mask = noop,
66 .irq_unmask = noop,
67 END_INIT
68};
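
dummy_irq_chip, now split out into dummychip.c, is intended for interrupt sources that need no real hardware operations, for example the virtual irqs behind a chained demultiplexer. A hedged usage sketch, where parent_irq, virq and gpio_demux_handler() are made-up names:

    /* Sketch: give demuxed virtual irqs the dummy chip so the core's
     * ack/mask/unmask calls become harmless no-ops; the parent line keeps
     * its real chip and a chained flow handler.
     */
    set_irq_chip_and_handler(virq, &dummy_irq_chip, handle_simple_irq);
    set_irq_chained_handler(parent_irq, gpio_demux_handler);
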
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 27e5c6911223..e2347eb63306 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -11,24 +11,15 @@
11 */ 11 */
12 12
13#include <linux/irq.h> 13#include <linux/irq.h>
14#include <linux/sched.h>
15#include <linux/slab.h>
16#include <linux/module.h>
17#include <linux/random.h> 14#include <linux/random.h>
15#include <linux/sched.h>
18#include <linux/interrupt.h> 16#include <linux/interrupt.h>
19#include <linux/kernel_stat.h> 17#include <linux/kernel_stat.h>
20#include <linux/rculist.h> 18
21#include <linux/hash.h>
22#include <linux/radix-tree.h>
23#include <trace/events/irq.h> 19#include <trace/events/irq.h>
24 20
25#include "internals.h" 21#include "internals.h"
26 22
27/*
28 * lockdep: we want to handle all irq_desc locks as a single lock-class:
29 */
30struct lock_class_key irq_desc_lock_class;
31
32/** 23/**
33 * handle_bad_irq - handle spurious and unhandled irqs 24 * handle_bad_irq - handle spurious and unhandled irqs
34 * @irq: the interrupt number 25 * @irq: the interrupt number
@@ -43,304 +34,6 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
43 ack_bad_irq(irq); 34 ack_bad_irq(irq);
44} 35}
45 36
46#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
47static void __init init_irq_default_affinity(void)
48{
49 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
50 cpumask_setall(irq_default_affinity);
51}
52#else
53static void __init init_irq_default_affinity(void)
54{
55}
56#endif
57
58/*
59 * Linux has a controller-independent interrupt architecture.
60 * Every controller has a 'controller-template', that is used
61 * by the main code to do the right thing. Each driver-visible
62 * interrupt source is transparently wired to the appropriate
63 * controller. Thus drivers need not be aware of the
64 * interrupt-controller.
65 *
66 * The code is designed to be easily extended with new/different
67 * interrupt controllers, without having to do assembly magic or
68 * having to touch the generic code.
69 *
70 * Controller mappings for all interrupt sources:
71 */
72int nr_irqs = NR_IRQS;
73EXPORT_SYMBOL_GPL(nr_irqs);
74
75#ifdef CONFIG_SPARSE_IRQ
76
77static struct irq_desc irq_desc_init = {
78 .irq = -1,
79 .status = IRQ_DISABLED,
80 .chip = &no_irq_chip,
81 .handle_irq = handle_bad_irq,
82 .depth = 1,
83 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
84};
85
86void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
87{
88 void *ptr;
89
90 ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
91 GFP_ATOMIC, node);
92
93 /*
94 * don't overwite if can not get new one
95 * init_copy_kstat_irqs() could still use old one
96 */
97 if (ptr) {
98 printk(KERN_DEBUG " alloc kstat_irqs on node %d\n", node);
99 desc->kstat_irqs = ptr;
100 }
101}
102
103static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
104{
105 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
106
107 raw_spin_lock_init(&desc->lock);
108 desc->irq = irq;
109#ifdef CONFIG_SMP
110 desc->node = node;
111#endif
112 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
113 init_kstat_irqs(desc, node, nr_cpu_ids);
114 if (!desc->kstat_irqs) {
115 printk(KERN_ERR "can not alloc kstat_irqs\n");
116 BUG_ON(1);
117 }
118 if (!alloc_desc_masks(desc, node, false)) {
119 printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
120 BUG_ON(1);
121 }
122 init_desc_masks(desc);
123 arch_init_chip_data(desc, node);
124}
125
126/*
127 * Protect the sparse_irqs:
128 */
129DEFINE_RAW_SPINLOCK(sparse_irq_lock);
130
131static RADIX_TREE(irq_desc_tree, GFP_ATOMIC);
132
133static void set_irq_desc(unsigned int irq, struct irq_desc *desc)
134{
135 radix_tree_insert(&irq_desc_tree, irq, desc);
136}
137
138struct irq_desc *irq_to_desc(unsigned int irq)
139{
140 return radix_tree_lookup(&irq_desc_tree, irq);
141}
142
143void replace_irq_desc(unsigned int irq, struct irq_desc *desc)
144{
145 void **ptr;
146
147 ptr = radix_tree_lookup_slot(&irq_desc_tree, irq);
148 if (ptr)
149 radix_tree_replace_slot(ptr, desc);
150}
151
152static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
153 [0 ... NR_IRQS_LEGACY-1] = {
154 .irq = -1,
155 .status = IRQ_DISABLED,
156 .chip = &no_irq_chip,
157 .handle_irq = handle_bad_irq,
158 .depth = 1,
159 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
160 }
161};
162
163static unsigned int *kstat_irqs_legacy;
164
165int __init early_irq_init(void)
166{
167 struct irq_desc *desc;
168 int legacy_count;
169 int node;
170 int i;
171
172 init_irq_default_affinity();
173
174 /* initialize nr_irqs based on nr_cpu_ids */
175 arch_probe_nr_irqs();
176 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
177
178 desc = irq_desc_legacy;
179 legacy_count = ARRAY_SIZE(irq_desc_legacy);
180 node = first_online_node;
181
182 /* allocate based on nr_cpu_ids */
183 kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
184 sizeof(int), GFP_NOWAIT, node);
185
186 for (i = 0; i < legacy_count; i++) {
187 desc[i].irq = i;
188#ifdef CONFIG_SMP
189 desc[i].node = node;
190#endif
191 desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
192 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
193 alloc_desc_masks(&desc[i], node, true);
194 init_desc_masks(&desc[i]);
195 set_irq_desc(i, &desc[i]);
196 }
197
198 return arch_early_irq_init();
199}
200
201struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
202{
203 struct irq_desc *desc;
204 unsigned long flags;
205
206 if (irq >= nr_irqs) {
207 WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
208 irq, nr_irqs);
209 return NULL;
210 }
211
212 desc = irq_to_desc(irq);
213 if (desc)
214 return desc;
215
216 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
217
218 /* We have to check it to avoid races with another CPU */
219 desc = irq_to_desc(irq);
220 if (desc)
221 goto out_unlock;
222
223 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
224
225 printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node);
226 if (!desc) {
227 printk(KERN_ERR "can not alloc irq_desc\n");
228 BUG_ON(1);
229 }
230 init_one_irq_desc(irq, desc, node);
231
232 set_irq_desc(irq, desc);
233
234out_unlock:
235 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
236
237 return desc;
238}
239
240#else /* !CONFIG_SPARSE_IRQ */
241
242struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
243 [0 ... NR_IRQS-1] = {
244 .status = IRQ_DISABLED,
245 .chip = &no_irq_chip,
246 .handle_irq = handle_bad_irq,
247 .depth = 1,
248 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
249 }
250};
251
252static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
253int __init early_irq_init(void)
254{
255 struct irq_desc *desc;
256 int count;
257 int i;
258
259 init_irq_default_affinity();
260
261 printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
262
263 desc = irq_desc;
264 count = ARRAY_SIZE(irq_desc);
265
266 for (i = 0; i < count; i++) {
267 desc[i].irq = i;
268 alloc_desc_masks(&desc[i], 0, true);
269 init_desc_masks(&desc[i]);
270 desc[i].kstat_irqs = kstat_irqs_all[i];
271 }
272 return arch_early_irq_init();
273}
274
275struct irq_desc *irq_to_desc(unsigned int irq)
276{
277 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
278}
279
280struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
281{
282 return irq_to_desc(irq);
283}
284#endif /* !CONFIG_SPARSE_IRQ */
285
286void clear_kstat_irqs(struct irq_desc *desc)
287{
288 memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
289}
290
291/*
292 * What should we do if we get a hw irq event on an illegal vector?
293 * Each architecture has to answer this themself.
294 */
295static void ack_bad(unsigned int irq)
296{
297 struct irq_desc *desc = irq_to_desc(irq);
298
299 print_irq_desc(irq, desc);
300 ack_bad_irq(irq);
301}
302
303/*
304 * NOP functions
305 */
306static void noop(unsigned int irq)
307{
308}
309
310static unsigned int noop_ret(unsigned int irq)
311{
312 return 0;
313}
314
315/*
316 * Generic no controller implementation
317 */
318struct irq_chip no_irq_chip = {
319 .name = "none",
320 .startup = noop_ret,
321 .shutdown = noop,
322 .enable = noop,
323 .disable = noop,
324 .ack = ack_bad,
325 .end = noop,
326};
327
328/*
329 * Generic dummy implementation which can be used for
330 * real dumb interrupt sources
331 */
332struct irq_chip dummy_irq_chip = {
333 .name = "dummy",
334 .startup = noop_ret,
335 .shutdown = noop,
336 .enable = noop,
337 .disable = noop,
338 .ack = noop,
339 .mask = noop,
340 .unmask = noop,
341 .end = noop,
342};
343
344/* 37/*
345 * Special, empty irq handler: 38 * Special, empty irq handler:
346 */ 39 */
@@ -457,20 +150,20 @@ unsigned int __do_IRQ(unsigned int irq)
457 /* 150 /*
458 * No locking required for CPU-local interrupts: 151 * No locking required for CPU-local interrupts:
459 */ 152 */
460 if (desc->chip->ack) 153 if (desc->irq_data.chip->ack)
461 desc->chip->ack(irq); 154 desc->irq_data.chip->ack(irq);
462 if (likely(!(desc->status & IRQ_DISABLED))) { 155 if (likely(!(desc->status & IRQ_DISABLED))) {
463 action_ret = handle_IRQ_event(irq, desc->action); 156 action_ret = handle_IRQ_event(irq, desc->action);
464 if (!noirqdebug) 157 if (!noirqdebug)
465 note_interrupt(irq, desc, action_ret); 158 note_interrupt(irq, desc, action_ret);
466 } 159 }
467 desc->chip->end(irq); 160 desc->irq_data.chip->end(irq);
468 return 1; 161 return 1;
469 } 162 }
470 163
471 raw_spin_lock(&desc->lock); 164 raw_spin_lock(&desc->lock);
472 if (desc->chip->ack) 165 if (desc->irq_data.chip->ack)
473 desc->chip->ack(irq); 166 desc->irq_data.chip->ack(irq);
474 /* 167 /*
475 * REPLAY is when Linux resends an IRQ that was dropped earlier 168 * REPLAY is when Linux resends an IRQ that was dropped earlier
476 * WAITING is used by probe to mark irqs that are being tested 169 * WAITING is used by probe to mark irqs that are being tested
@@ -530,27 +223,9 @@ out:
530 * The ->end() handler has to deal with interrupts which got 223 * The ->end() handler has to deal with interrupts which got
531 * disabled while the handler was running. 224 * disabled while the handler was running.
532 */ 225 */
533 desc->chip->end(irq); 226 desc->irq_data.chip->end(irq);
534 raw_spin_unlock(&desc->lock); 227 raw_spin_unlock(&desc->lock);
535 228
536 return 1; 229 return 1;
537} 230}
538#endif 231#endif
539
540void early_init_irq_lock_class(void)
541{
542 struct irq_desc *desc;
543 int i;
544
545 for_each_irq_desc(i, desc) {
546 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
547 }
548}
549
550unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
551{
552 struct irq_desc *desc = irq_to_desc(irq);
553 return desc ? desc->kstat_irqs[cpu] : 0;
554}
555EXPORT_SYMBOL(kstat_irqs_cpu);
556
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index c63f3bc88f0b..4571ae7e085a 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -1,9 +1,12 @@
1/* 1/*
2 * IRQ subsystem internal functions and variables: 2 * IRQ subsystem internal functions and variables:
3 */ 3 */
4#include <linux/irqdesc.h>
4 5
5extern int noirqdebug; 6extern int noirqdebug;
6 7
8#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
9
7/* Set default functions for irq_chip structures: */ 10/* Set default functions for irq_chip structures: */
8extern void irq_chip_set_defaults(struct irq_chip *chip); 11extern void irq_chip_set_defaults(struct irq_chip *chip);
9 12
@@ -15,21 +18,19 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
15extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); 18extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
16extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 19extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
17 20
18extern struct lock_class_key irq_desc_lock_class;
19extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 21extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
20extern void clear_kstat_irqs(struct irq_desc *desc);
21extern raw_spinlock_t sparse_irq_lock;
22 22
23#ifdef CONFIG_SPARSE_IRQ 23/* Resending of interrupts :*/
24void replace_irq_desc(unsigned int irq, struct irq_desc *desc); 24void check_irq_resend(struct irq_desc *desc, unsigned int irq);
25#endif
26 25
27#ifdef CONFIG_PROC_FS 26#ifdef CONFIG_PROC_FS
28extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); 27extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
28extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc);
29extern void register_handler_proc(unsigned int irq, struct irqaction *action); 29extern void register_handler_proc(unsigned int irq, struct irqaction *action);
30extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); 30extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);
31#else 31#else
32static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { } 32static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { }
33static inline void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) { }
33static inline void register_handler_proc(unsigned int irq, 34static inline void register_handler_proc(unsigned int irq,
34 struct irqaction *action) { } 35 struct irqaction *action) { }
35static inline void unregister_handler_proc(unsigned int irq, 36static inline void unregister_handler_proc(unsigned int irq,
@@ -40,17 +41,27 @@ extern int irq_select_affinity_usr(unsigned int irq);
40 41
41extern void irq_set_thread_affinity(struct irq_desc *desc); 42extern void irq_set_thread_affinity(struct irq_desc *desc);
42 43
44#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
45static inline void irq_end(unsigned int irq, struct irq_desc *desc)
46{
47 if (desc->irq_data.chip && desc->irq_data.chip->end)
48 desc->irq_data.chip->end(irq);
49}
50#else
51static inline void irq_end(unsigned int irq, struct irq_desc *desc) { }
52#endif
53
43/* Inline functions for support of irq chips on slow busses */ 54/* Inline functions for support of irq chips on slow busses */
44static inline void chip_bus_lock(unsigned int irq, struct irq_desc *desc) 55static inline void chip_bus_lock(struct irq_desc *desc)
45{ 56{
46 if (unlikely(desc->chip->bus_lock)) 57 if (unlikely(desc->irq_data.chip->irq_bus_lock))
47 desc->chip->bus_lock(irq); 58 desc->irq_data.chip->irq_bus_lock(&desc->irq_data);
48} 59}
49 60
50static inline void chip_bus_sync_unlock(unsigned int irq, struct irq_desc *desc) 61static inline void chip_bus_sync_unlock(struct irq_desc *desc)
51{ 62{
52 if (unlikely(desc->chip->bus_sync_unlock)) 63 if (unlikely(desc->irq_data.chip->irq_bus_sync_unlock))
53 desc->chip->bus_sync_unlock(irq); 64 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
54} 65}
55 66
56/* 67/*
@@ -67,8 +78,8 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
67 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); 78 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
68 printk("->handle_irq(): %p, ", desc->handle_irq); 79 printk("->handle_irq(): %p, ", desc->handle_irq);
69 print_symbol("%s\n", (unsigned long)desc->handle_irq); 80 print_symbol("%s\n", (unsigned long)desc->handle_irq);
70 printk("->chip(): %p, ", desc->chip); 81 printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
71 print_symbol("%s\n", (unsigned long)desc->chip); 82 print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
72 printk("->action(): %p\n", desc->action); 83 printk("->action(): %p\n", desc->action);
73 if (desc->action) { 84 if (desc->action) {
74 printk("->action->handler(): %p, ", desc->action->handler); 85 printk("->action->handler(): %p, ", desc->action->handler);
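
chip_bus_lock()/chip_bus_sync_unlock() above now take only the descriptor and call the new irq_bus_lock/irq_bus_sync_unlock methods. These hooks exist for chips behind slow buses (I2C/SPI expanders) that cannot touch their registers under the raw desc->lock. A chip-side sketch, with struct expander, its buslock and mask_cache fields, EXP_MASK_REG and expander_i2c_write() all assumed for illustration:

    /* Sketch: a slow-bus irq chip caches mask updates while desc->lock is
     * held and flushes them from irq_bus_sync_unlock(), where it may sleep.
     */
    static void expander_irq_bus_lock(struct irq_data *d)
    {
            struct expander *ex = d->chip_data;

            mutex_lock(&ex->buslock);
    }

    static void expander_irq_bus_sync_unlock(struct irq_data *d)
    {
            struct expander *ex = d->chip_data;

            /* push the cached mask register out over the slow bus */
            expander_i2c_write(ex, EXP_MASK_REG, ex->mask_cache);
            mutex_unlock(&ex->buslock);
    }
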
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
new file mode 100644
index 000000000000..9d917ff72675
--- /dev/null
+++ b/kernel/irq/irqdesc.c
@@ -0,0 +1,395 @@
1/*
2 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
3 * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
4 *
5 * This file contains the interrupt descriptor management code
6 *
7 * Detailed information is available in Documentation/DocBook/genericirq
8 *
9 */
10#include <linux/irq.h>
11#include <linux/slab.h>
12#include <linux/module.h>
13#include <linux/interrupt.h>
14#include <linux/kernel_stat.h>
15#include <linux/radix-tree.h>
16#include <linux/bitmap.h>
17
18#include "internals.h"
19
20/*
21 * lockdep: we want to handle all irq_desc locks as a single lock-class:
22 */
23static struct lock_class_key irq_desc_lock_class;
24
25#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
26static void __init init_irq_default_affinity(void)
27{
28 alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
29 cpumask_setall(irq_default_affinity);
30}
31#else
32static void __init init_irq_default_affinity(void)
33{
34}
35#endif
36
37#ifdef CONFIG_SMP
38static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node)
39{
40 if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node))
41 return -ENOMEM;
42
43#ifdef CONFIG_GENERIC_PENDING_IRQ
44 if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
45 free_cpumask_var(desc->irq_data.affinity);
46 return -ENOMEM;
47 }
48#endif
49 return 0;
50}
51
52static void desc_smp_init(struct irq_desc *desc, int node)
53{
54 desc->irq_data.node = node;
55 cpumask_copy(desc->irq_data.affinity, irq_default_affinity);
56#ifdef CONFIG_GENERIC_PENDING_IRQ
57 cpumask_clear(desc->pending_mask);
58#endif
59}
60
61static inline int desc_node(struct irq_desc *desc)
62{
63 return desc->irq_data.node;
64}
65
66#else
67static inline int
68alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; }
69static inline void desc_smp_init(struct irq_desc *desc, int node) { }
70static inline int desc_node(struct irq_desc *desc) { return 0; }
71#endif
72
73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
74{
75 desc->irq_data.irq = irq;
76 desc->irq_data.chip = &no_irq_chip;
77 desc->irq_data.chip_data = NULL;
78 desc->irq_data.handler_data = NULL;
79 desc->irq_data.msi_desc = NULL;
80 desc->status = IRQ_DEFAULT_INIT_FLAGS;
81 desc->handle_irq = handle_bad_irq;
82 desc->depth = 1;
83 desc->irq_count = 0;
84 desc->irqs_unhandled = 0;
85 desc->name = NULL;
86 memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
87 desc_smp_init(desc, node);
88}
89
90int nr_irqs = NR_IRQS;
91EXPORT_SYMBOL_GPL(nr_irqs);
92
93static DEFINE_MUTEX(sparse_irq_lock);
94static DECLARE_BITMAP(allocated_irqs, NR_IRQS);
95
96#ifdef CONFIG_SPARSE_IRQ
97
98static RADIX_TREE(irq_desc_tree, GFP_KERNEL);
99
100static void irq_insert_desc(unsigned int irq, struct irq_desc *desc)
101{
102 radix_tree_insert(&irq_desc_tree, irq, desc);
103}
104
105struct irq_desc *irq_to_desc(unsigned int irq)
106{
107 return radix_tree_lookup(&irq_desc_tree, irq);
108}
109
110static void delete_irq_desc(unsigned int irq)
111{
112 radix_tree_delete(&irq_desc_tree, irq);
113}
114
115#ifdef CONFIG_SMP
116static void free_masks(struct irq_desc *desc)
117{
118#ifdef CONFIG_GENERIC_PENDING_IRQ
119 free_cpumask_var(desc->pending_mask);
120#endif
121 free_cpumask_var(desc->irq_data.affinity);
122}
123#else
124static inline void free_masks(struct irq_desc *desc) { }
125#endif
126
127static struct irq_desc *alloc_desc(int irq, int node)
128{
129 struct irq_desc *desc;
130 gfp_t gfp = GFP_KERNEL;
131
132 desc = kzalloc_node(sizeof(*desc), gfp, node);
133 if (!desc)
134 return NULL;
135 /* allocate based on nr_cpu_ids */
136 desc->kstat_irqs = kzalloc_node(nr_cpu_ids * sizeof(*desc->kstat_irqs),
137 gfp, node);
138 if (!desc->kstat_irqs)
139 goto err_desc;
140
141 if (alloc_masks(desc, gfp, node))
142 goto err_kstat;
143
144 raw_spin_lock_init(&desc->lock);
145 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
146
147 desc_set_defaults(irq, desc, node);
148
149 return desc;
150
151err_kstat:
152 kfree(desc->kstat_irqs);
153err_desc:
154 kfree(desc);
155 return NULL;
156}
157
158static void free_desc(unsigned int irq)
159{
160 struct irq_desc *desc = irq_to_desc(irq);
161
162 unregister_irq_proc(irq, desc);
163
164 mutex_lock(&sparse_irq_lock);
165 delete_irq_desc(irq);
166 mutex_unlock(&sparse_irq_lock);
167
168 free_masks(desc);
169 kfree(desc->kstat_irqs);
170 kfree(desc);
171}
172
173static int alloc_descs(unsigned int start, unsigned int cnt, int node)
174{
175 struct irq_desc *desc;
176 int i;
177
178 for (i = 0; i < cnt; i++) {
179 desc = alloc_desc(start + i, node);
180 if (!desc)
181 goto err;
182 mutex_lock(&sparse_irq_lock);
183 irq_insert_desc(start + i, desc);
184 mutex_unlock(&sparse_irq_lock);
185 }
186 return start;
187
188err:
189 for (i--; i >= 0; i--)
190 free_desc(start + i);
191
192 mutex_lock(&sparse_irq_lock);
193 bitmap_clear(allocated_irqs, start, cnt);
194 mutex_unlock(&sparse_irq_lock);
195 return -ENOMEM;
196}
197
198struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
199{
200 int res = irq_alloc_descs(irq, irq, 1, node);
201
202 if (res == -EEXIST || res == irq)
203 return irq_to_desc(irq);
204 return NULL;
205}
206
207int __init early_irq_init(void)
208{
209 int i, initcnt, node = first_online_node;
210 struct irq_desc *desc;
211
212 init_irq_default_affinity();
213
214 /* Let arch update nr_irqs and return the nr of preallocated irqs */
215 initcnt = arch_probe_nr_irqs();
216 printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt);
217
218 for (i = 0; i < initcnt; i++) {
219 desc = alloc_desc(i, node);
220 set_bit(i, allocated_irqs);
221 irq_insert_desc(i, desc);
222 }
223 return arch_early_irq_init();
224}
225
226#else /* !CONFIG_SPARSE_IRQ */
227
228struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
229 [0 ... NR_IRQS-1] = {
230 .status = IRQ_DEFAULT_INIT_FLAGS,
231 .handle_irq = handle_bad_irq,
232 .depth = 1,
233 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
234 }
235};
236
237static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
238int __init early_irq_init(void)
239{
240 int count, i, node = first_online_node;
241 struct irq_desc *desc;
242
243 init_irq_default_affinity();
244
245 printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
246
247 desc = irq_desc;
248 count = ARRAY_SIZE(irq_desc);
249
250 for (i = 0; i < count; i++) {
251 desc[i].irq_data.irq = i;
252 desc[i].irq_data.chip = &no_irq_chip;
253 desc[i].kstat_irqs = kstat_irqs_all[i];
254 alloc_masks(desc + i, GFP_KERNEL, node);
255 desc_smp_init(desc + i, node);
256 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
257 }
258 return arch_early_irq_init();
259}
260
261struct irq_desc *irq_to_desc(unsigned int irq)
262{
263 return (irq < NR_IRQS) ? irq_desc + irq : NULL;
264}
265
266struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
267{
268 return irq_to_desc(irq);
269}
270
271static void free_desc(unsigned int irq)
272{
273 dynamic_irq_cleanup(irq);
274}
275
276static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
277{
278 return start;
279}
280#endif /* !CONFIG_SPARSE_IRQ */
281
282/* Dynamic interrupt handling */
283
284/**
285 * irq_free_descs - free irq descriptors
286 * @from: Start of descriptor range
287 * @cnt: Number of consecutive irqs to free
288 */
289void irq_free_descs(unsigned int from, unsigned int cnt)
290{
291 int i;
292
293 if (from >= nr_irqs || (from + cnt) > nr_irqs)
294 return;
295
296 for (i = 0; i < cnt; i++)
297 free_desc(from + i);
298
299 mutex_lock(&sparse_irq_lock);
300 bitmap_clear(allocated_irqs, from, cnt);
301 mutex_unlock(&sparse_irq_lock);
302}
303
304/**
305 * irq_alloc_descs - allocate and initialize a range of irq descriptors
306 * @irq: Allocate for specific irq number if irq >= 0
307 * @from: Start the search from this irq number
308 * @cnt: Number of consecutive irqs to allocate.
309 * @node: Preferred node on which the irq descriptor should be allocated
310 *
311 * Returns the first irq number or error code
312 */
313int __ref
314irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
315{
316 int start, ret;
317
318 if (!cnt)
319 return -EINVAL;
320
321 mutex_lock(&sparse_irq_lock);
322
323 start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0);
324 ret = -EEXIST;
325 if (irq >=0 && start != irq)
326 goto err;
327
328 ret = -ENOMEM;
329 if (start >= nr_irqs)
330 goto err;
331
332 bitmap_set(allocated_irqs, start, cnt);
333 mutex_unlock(&sparse_irq_lock);
334 return alloc_descs(start, cnt, node);
335
336err:
337 mutex_unlock(&sparse_irq_lock);
338 return ret;
339}
340
341/**
342 * irq_reserve_irqs - mark irqs allocated
343 * @from: mark from irq number
344 * @cnt: number of irqs to mark
345 *
346 * Returns 0 on success or an appropriate error code
347 */
348int irq_reserve_irqs(unsigned int from, unsigned int cnt)
349{
350 unsigned int start;
351 int ret = 0;
352
353 if (!cnt || (from + cnt) > nr_irqs)
354 return -EINVAL;
355
356 mutex_lock(&sparse_irq_lock);
357 start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0);
358 if (start == from)
359 bitmap_set(allocated_irqs, start, cnt);
360 else
361 ret = -EEXIST;
362 mutex_unlock(&sparse_irq_lock);
363 return ret;
364}
365
366/**
367 * irq_get_next_irq - get next allocated irq number
368 * @offset: where to start the search
369 *
370 * Returns next irq number after offset or nr_irqs if none is found.
371 */
372unsigned int irq_get_next_irq(unsigned int offset)
373{
374 return find_next_bit(allocated_irqs, nr_irqs, offset);
375}
376
377/**
378 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
379 * @irq: irq number to initialize
380 */
381void dynamic_irq_cleanup(unsigned int irq)
382{
383 struct irq_desc *desc = irq_to_desc(irq);
384 unsigned long flags;
385
386 raw_spin_lock_irqsave(&desc->lock, flags);
387 desc_set_defaults(irq, desc, desc_node(desc));
388 raw_spin_unlock_irqrestore(&desc->lock, flags);
389}
390
391unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
392{
393 struct irq_desc *desc = irq_to_desc(irq);
394 return desc ? desc->kstat_irqs[cpu] : 0;
395}
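
The new irqdesc.c replaces the old sparse-irq allocation in handle.c with a bitmap of allocated irqs plus irq_alloc_descs()/irq_free_descs(). A usage sketch for code that wants a block of interrupt numbers; the range, the count and the foo_setup_irq_block() wrapper are arbitrary choices for illustration:

    /* Sketch: allocate 8 consecutive irq descriptors at or above irq 64,
     * preferably on the local node, then release them again.
     */
    static int foo_setup_irq_block(void)
    {
            int base = irq_alloc_descs(-1, 64, 8, numa_node_id());

            if (base < 0)
                    return base;    /* -EEXIST or -ENOMEM from the allocator */

            /* ... install chip and handler for base .. base + 7 here ... */

            irq_free_descs(base, 8);
            return 0;
    }
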
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index c3003e9d91a3..644e8d5fa367 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -73,8 +73,8 @@ int irq_can_set_affinity(unsigned int irq)
73{ 73{
74 struct irq_desc *desc = irq_to_desc(irq); 74 struct irq_desc *desc = irq_to_desc(irq);
75 75
76 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip || 76 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip ||
77 !desc->chip->set_affinity) 77 !desc->irq_data.chip->irq_set_affinity)
78 return 0; 78 return 0;
79 79
80 return 1; 80 return 1;
@@ -109,17 +109,18 @@ void irq_set_thread_affinity(struct irq_desc *desc)
109int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) 109int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
110{ 110{
111 struct irq_desc *desc = irq_to_desc(irq); 111 struct irq_desc *desc = irq_to_desc(irq);
112 struct irq_chip *chip = desc->irq_data.chip;
112 unsigned long flags; 113 unsigned long flags;
113 114
114 if (!desc->chip->set_affinity) 115 if (!chip->irq_set_affinity)
115 return -EINVAL; 116 return -EINVAL;
116 117
117 raw_spin_lock_irqsave(&desc->lock, flags); 118 raw_spin_lock_irqsave(&desc->lock, flags);
118 119
119#ifdef CONFIG_GENERIC_PENDING_IRQ 120#ifdef CONFIG_GENERIC_PENDING_IRQ
120 if (desc->status & IRQ_MOVE_PCNTXT) { 121 if (desc->status & IRQ_MOVE_PCNTXT) {
121 if (!desc->chip->set_affinity(irq, cpumask)) { 122 if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
122 cpumask_copy(desc->affinity, cpumask); 123 cpumask_copy(desc->irq_data.affinity, cpumask);
123 irq_set_thread_affinity(desc); 124 irq_set_thread_affinity(desc);
124 } 125 }
125 } 126 }
@@ -128,8 +129,8 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
128 cpumask_copy(desc->pending_mask, cpumask); 129 cpumask_copy(desc->pending_mask, cpumask);
129 } 130 }
130#else 131#else
131 if (!desc->chip->set_affinity(irq, cpumask)) { 132 if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
132 cpumask_copy(desc->affinity, cpumask); 133 cpumask_copy(desc->irq_data.affinity, cpumask);
133 irq_set_thread_affinity(desc); 134 irq_set_thread_affinity(desc);
134 } 135 }
135#endif 136#endif
@@ -168,16 +169,16 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc)
168 * one of the targets is online. 169 * one of the targets is online.
169 */ 170 */
170 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { 171 if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
171 if (cpumask_any_and(desc->affinity, cpu_online_mask) 172 if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask)
172 < nr_cpu_ids) 173 < nr_cpu_ids)
173 goto set_affinity; 174 goto set_affinity;
174 else 175 else
175 desc->status &= ~IRQ_AFFINITY_SET; 176 desc->status &= ~IRQ_AFFINITY_SET;
176 } 177 }
177 178
178 cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity); 179 cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity);
179set_affinity: 180set_affinity:
180 desc->chip->set_affinity(irq, desc->affinity); 181 desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false);
181 182
182 return 0; 183 return 0;
183} 184}
@@ -223,7 +224,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
223 224
224 if (!desc->depth++) { 225 if (!desc->depth++) {
225 desc->status |= IRQ_DISABLED; 226 desc->status |= IRQ_DISABLED;
226 desc->chip->disable(irq); 227 desc->irq_data.chip->irq_disable(&desc->irq_data);
227 } 228 }
228} 229}
229 230
@@ -246,11 +247,11 @@ void disable_irq_nosync(unsigned int irq)
246 if (!desc) 247 if (!desc)
247 return; 248 return;
248 249
249 chip_bus_lock(irq, desc); 250 chip_bus_lock(desc);
250 raw_spin_lock_irqsave(&desc->lock, flags); 251 raw_spin_lock_irqsave(&desc->lock, flags);
251 __disable_irq(desc, irq, false); 252 __disable_irq(desc, irq, false);
252 raw_spin_unlock_irqrestore(&desc->lock, flags); 253 raw_spin_unlock_irqrestore(&desc->lock, flags);
253 chip_bus_sync_unlock(irq, desc); 254 chip_bus_sync_unlock(desc);
254} 255}
255EXPORT_SYMBOL(disable_irq_nosync); 256EXPORT_SYMBOL(disable_irq_nosync);
256 257
@@ -313,7 +314,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
313 * IRQ line is re-enabled. 314 * IRQ line is re-enabled.
314 * 315 *
315 * This function may be called from IRQ context only when 316 * This function may be called from IRQ context only when
316 * desc->chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! 317 * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL !
317 */ 318 */
318void enable_irq(unsigned int irq) 319void enable_irq(unsigned int irq)
319{ 320{
@@ -323,11 +324,11 @@ void enable_irq(unsigned int irq)
323 if (!desc) 324 if (!desc)
324 return; 325 return;
325 326
326 chip_bus_lock(irq, desc); 327 chip_bus_lock(desc);
327 raw_spin_lock_irqsave(&desc->lock, flags); 328 raw_spin_lock_irqsave(&desc->lock, flags);
328 __enable_irq(desc, irq, false); 329 __enable_irq(desc, irq, false);
329 raw_spin_unlock_irqrestore(&desc->lock, flags); 330 raw_spin_unlock_irqrestore(&desc->lock, flags);
330 chip_bus_sync_unlock(irq, desc); 331 chip_bus_sync_unlock(desc);
331} 332}
332EXPORT_SYMBOL(enable_irq); 333EXPORT_SYMBOL(enable_irq);
333 334
@@ -336,8 +337,8 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
336 struct irq_desc *desc = irq_to_desc(irq); 337 struct irq_desc *desc = irq_to_desc(irq);
337 int ret = -ENXIO; 338 int ret = -ENXIO;
338 339
339 if (desc->chip->set_wake) 340 if (desc->irq_data.chip->irq_set_wake)
340 ret = desc->chip->set_wake(irq, on); 341 ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on);
341 342
342 return ret; 343 return ret;
343} 344}
@@ -429,12 +430,12 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc)
429} 430}
430 431
431int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 432int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
432 unsigned long flags) 433 unsigned long flags)
433{ 434{
434 int ret; 435 int ret;
435 struct irq_chip *chip = desc->chip; 436 struct irq_chip *chip = desc->irq_data.chip;
436 437
437 if (!chip || !chip->set_type) { 438 if (!chip || !chip->irq_set_type) {
438 /* 439 /*
439 * IRQF_TRIGGER_* but the PIC does not support multiple 440 * IRQF_TRIGGER_* but the PIC does not support multiple
440 * flow-types? 441 * flow-types?
@@ -445,11 +446,11 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
445 } 446 }
446 447
447 /* caller masked out all except trigger mode flags */ 448 /* caller masked out all except trigger mode flags */
448 ret = chip->set_type(irq, flags); 449 ret = chip->irq_set_type(&desc->irq_data, flags);
449 450
450 if (ret) 451 if (ret)
451 pr_err("setting trigger mode %d for irq %u failed (%pF)\n", 452 pr_err("setting trigger mode %lu for irq %u failed (%pF)\n",
452 (int)flags, irq, chip->set_type); 453 flags, irq, chip->irq_set_type);
453 else { 454 else {
454 if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH)) 455 if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
455 flags |= IRQ_LEVEL; 456 flags |= IRQ_LEVEL;
@@ -457,8 +458,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
457 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); 458 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
458 desc->status |= flags; 459 desc->status |= flags;
459 460
460 if (chip != desc->chip) 461 if (chip != desc->irq_data.chip)
461 irq_chip_set_defaults(desc->chip); 462 irq_chip_set_defaults(desc->irq_data.chip);
462 } 463 }
463 464
464 return ret; 465 return ret;
@@ -507,7 +508,7 @@ static int irq_wait_for_interrupt(struct irqaction *action)
507static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) 508static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
508{ 509{
509again: 510again:
510 chip_bus_lock(irq, desc); 511 chip_bus_lock(desc);
511 raw_spin_lock_irq(&desc->lock); 512 raw_spin_lock_irq(&desc->lock);
512 513
513 /* 514 /*
@@ -521,17 +522,17 @@ again:
521 */ 522 */
522 if (unlikely(desc->status & IRQ_INPROGRESS)) { 523 if (unlikely(desc->status & IRQ_INPROGRESS)) {
523 raw_spin_unlock_irq(&desc->lock); 524 raw_spin_unlock_irq(&desc->lock);
524 chip_bus_sync_unlock(irq, desc); 525 chip_bus_sync_unlock(desc);
525 cpu_relax(); 526 cpu_relax();
526 goto again; 527 goto again;
527 } 528 }
528 529
529 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { 530 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
530 desc->status &= ~IRQ_MASKED; 531 desc->status &= ~IRQ_MASKED;
531 desc->chip->unmask(irq); 532 desc->irq_data.chip->irq_unmask(&desc->irq_data);
532 } 533 }
533 raw_spin_unlock_irq(&desc->lock); 534 raw_spin_unlock_irq(&desc->lock);
534 chip_bus_sync_unlock(irq, desc); 535 chip_bus_sync_unlock(desc);
535} 536}
536 537
537#ifdef CONFIG_SMP 538#ifdef CONFIG_SMP
@@ -556,7 +557,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
556 } 557 }
557 558
558 raw_spin_lock_irq(&desc->lock); 559 raw_spin_lock_irq(&desc->lock);
559 cpumask_copy(mask, desc->affinity); 560 cpumask_copy(mask, desc->irq_data.affinity);
560 raw_spin_unlock_irq(&desc->lock); 561 raw_spin_unlock_irq(&desc->lock);
561 562
562 set_cpus_allowed_ptr(current, mask); 563 set_cpus_allowed_ptr(current, mask);
@@ -657,7 +658,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
657 if (!desc) 658 if (!desc)
658 return -EINVAL; 659 return -EINVAL;
659 660
660 if (desc->chip == &no_irq_chip) 661 if (desc->irq_data.chip == &no_irq_chip)
661 return -ENOSYS; 662 return -ENOSYS;
662 /* 663 /*
663 * Some drivers like serial.c use request_irq() heavily, 664 * Some drivers like serial.c use request_irq() heavily,
@@ -752,7 +753,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
752 } 753 }
753 754
754 if (!shared) { 755 if (!shared) {
755 irq_chip_set_defaults(desc->chip); 756 irq_chip_set_defaults(desc->irq_data.chip);
756 757
757 init_waitqueue_head(&desc->wait_for_threads); 758 init_waitqueue_head(&desc->wait_for_threads);
758 759
@@ -779,7 +780,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
779 if (!(desc->status & IRQ_NOAUTOEN)) { 780 if (!(desc->status & IRQ_NOAUTOEN)) {
780 desc->depth = 0; 781 desc->depth = 0;
781 desc->status &= ~IRQ_DISABLED; 782 desc->status &= ~IRQ_DISABLED;
782 desc->chip->startup(irq); 783 desc->irq_data.chip->irq_startup(&desc->irq_data);
783 } else 784 } else
784 /* Undo nested disables: */ 785 /* Undo nested disables: */
785 desc->depth = 1; 786 desc->depth = 1;
@@ -912,17 +913,17 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
912 913
913 /* Currently used only by UML, might disappear one day: */ 914 /* Currently used only by UML, might disappear one day: */
914#ifdef CONFIG_IRQ_RELEASE_METHOD 915#ifdef CONFIG_IRQ_RELEASE_METHOD
915 if (desc->chip->release) 916 if (desc->irq_data.chip->release)
916 desc->chip->release(irq, dev_id); 917 desc->irq_data.chip->release(irq, dev_id);
917#endif 918#endif
918 919
919 /* If this was the last handler, shut down the IRQ line: */ 920 /* If this was the last handler, shut down the IRQ line: */
920 if (!desc->action) { 921 if (!desc->action) {
921 desc->status |= IRQ_DISABLED; 922 desc->status |= IRQ_DISABLED;
922 if (desc->chip->shutdown) 923 if (desc->irq_data.chip->irq_shutdown)
923 desc->chip->shutdown(irq); 924 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
924 else 925 else
925 desc->chip->disable(irq); 926 desc->irq_data.chip->irq_disable(&desc->irq_data);
926 } 927 }
927 928
928#ifdef CONFIG_SMP 929#ifdef CONFIG_SMP
@@ -997,9 +998,9 @@ void free_irq(unsigned int irq, void *dev_id)
997 if (!desc) 998 if (!desc)
998 return; 999 return;
999 1000
1000 chip_bus_lock(irq, desc); 1001 chip_bus_lock(desc);
1001 kfree(__free_irq(irq, dev_id)); 1002 kfree(__free_irq(irq, dev_id));
1002 chip_bus_sync_unlock(irq, desc); 1003 chip_bus_sync_unlock(desc);
1003} 1004}
1004EXPORT_SYMBOL(free_irq); 1005EXPORT_SYMBOL(free_irq);
1005 1006
@@ -1086,9 +1087,9 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1086 action->name = devname; 1087 action->name = devname;
1087 action->dev_id = dev_id; 1088 action->dev_id = dev_id;
1088 1089
1089 chip_bus_lock(irq, desc); 1090 chip_bus_lock(desc);
1090 retval = __setup_irq(irq, desc, action); 1091 retval = __setup_irq(irq, desc, action);
1091 chip_bus_sync_unlock(irq, desc); 1092 chip_bus_sync_unlock(desc);
1092 1093
1093 if (retval) 1094 if (retval)
1094 kfree(action); 1095 kfree(action);
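
manage.c now invokes chip->irq_set_affinity(&desc->irq_data, cpumask, false): the callback gets the irq_data, the requested mask and a force flag, and a zero return lets the core copy the mask into desc->irq_data.affinity (see the hunks above). A chip-side sketch under those assumptions, with foo_route_to_cpu() standing in for the real hardware programming:

    /* Sketch: new-style irq_set_affinity implementation. Returning 0 tells
     * the core the move succeeded so it updates desc->irq_data.affinity.
     */
    static int foo_irq_set_affinity(struct irq_data *d,
                                    const struct cpumask *mask, bool force)
    {
            unsigned int cpu = cpumask_first(mask);

            if (cpu >= nr_cpu_ids)
                    return -EINVAL;

            foo_route_to_cpu(d->irq, cpu);  /* assumed hardware helper */
            return 0;
    }
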
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 241962280836..1d2541940480 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -7,6 +7,7 @@
7void move_masked_irq(int irq) 7void move_masked_irq(int irq)
8{ 8{
9 struct irq_desc *desc = irq_to_desc(irq); 9 struct irq_desc *desc = irq_to_desc(irq);
10 struct irq_chip *chip = desc->irq_data.chip;
10 11
11 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 12 if (likely(!(desc->status & IRQ_MOVE_PENDING)))
12 return; 13 return;
@@ -24,7 +25,7 @@ void move_masked_irq(int irq)
24 if (unlikely(cpumask_empty(desc->pending_mask))) 25 if (unlikely(cpumask_empty(desc->pending_mask)))
25 return; 26 return;
26 27
27 if (!desc->chip->set_affinity) 28 if (!chip->irq_set_affinity)
28 return; 29 return;
29 30
30 assert_raw_spin_locked(&desc->lock); 31 assert_raw_spin_locked(&desc->lock);
@@ -43,8 +44,9 @@ void move_masked_irq(int irq)
43 */ 44 */
44 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) 45 if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
45 < nr_cpu_ids)) 46 < nr_cpu_ids))
46 if (!desc->chip->set_affinity(irq, desc->pending_mask)) { 47 if (!chip->irq_set_affinity(&desc->irq_data,
47 cpumask_copy(desc->affinity, desc->pending_mask); 48 desc->pending_mask, false)) {
49 cpumask_copy(desc->irq_data.affinity, desc->pending_mask);
48 irq_set_thread_affinity(desc); 50 irq_set_thread_affinity(desc);
49 } 51 }
50 52
@@ -61,8 +63,8 @@ void move_native_irq(int irq)
61 if (unlikely(desc->status & IRQ_DISABLED)) 63 if (unlikely(desc->status & IRQ_DISABLED))
62 return; 64 return;
63 65
64 desc->chip->mask(irq); 66 desc->irq_data.chip->irq_mask(&desc->irq_data);
65 move_masked_irq(irq); 67 move_masked_irq(irq);
66 desc->chip->unmask(irq); 68 desc->irq_data.chip->irq_unmask(&desc->irq_data);
67} 69}
68 70
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
deleted file mode 100644
index 65d3845665ac..000000000000
--- a/kernel/irq/numa_migrate.c
+++ /dev/null
@@ -1,120 +0,0 @@
1/*
2 * NUMA irq-desc migration code
3 *
4 * Migrate IRQ data structures (irq_desc, chip_data, etc.) over to
5 * the new "home node" of the IRQ.
6 */
7
8#include <linux/irq.h>
9#include <linux/slab.h>
10#include <linux/module.h>
11#include <linux/random.h>
12#include <linux/interrupt.h>
13#include <linux/kernel_stat.h>
14
15#include "internals.h"
16
17static void init_copy_kstat_irqs(struct irq_desc *old_desc,
18 struct irq_desc *desc,
19 int node, int nr)
20{
21 init_kstat_irqs(desc, node, nr);
22
23 if (desc->kstat_irqs != old_desc->kstat_irqs)
24 memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
25 nr * sizeof(*desc->kstat_irqs));
26}
27
28static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
29{
30 if (old_desc->kstat_irqs == desc->kstat_irqs)
31 return;
32
33 kfree(old_desc->kstat_irqs);
34 old_desc->kstat_irqs = NULL;
35}
36
37static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
38 struct irq_desc *desc, int node)
39{
40 memcpy(desc, old_desc, sizeof(struct irq_desc));
41 if (!alloc_desc_masks(desc, node, false)) {
42 printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
43 "for migration.\n", irq);
44 return false;
45 }
46 raw_spin_lock_init(&desc->lock);
47 desc->node = node;
48 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
49 init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
50 init_copy_desc_masks(old_desc, desc);
51 arch_init_copy_chip_data(old_desc, desc, node);
52 return true;
53}
54
55static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
56{
57 free_kstat_irqs(old_desc, desc);
58 free_desc_masks(old_desc, desc);
59 arch_free_chip_data(old_desc, desc);
60}
61
62static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
63 int node)
64{
65 struct irq_desc *desc;
66 unsigned int irq;
67 unsigned long flags;
68
69 irq = old_desc->irq;
70
71 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
72
73 /* We have to check it to avoid races with another CPU */
74 desc = irq_to_desc(irq);
75
76 if (desc && old_desc != desc)
77 goto out_unlock;
78
79 desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
80 if (!desc) {
81 printk(KERN_ERR "irq %d: can not get new irq_desc "
82 "for migration.\n", irq);
83 /* still use old one */
84 desc = old_desc;
85 goto out_unlock;
86 }
87 if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) {
88 /* still use old one */
89 kfree(desc);
90 desc = old_desc;
91 goto out_unlock;
92 }
93
94 replace_irq_desc(irq, desc);
95 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
96
97 /* free the old one */
98 free_one_irq_desc(old_desc, desc);
99 kfree(old_desc);
100
101 return desc;
102
103out_unlock:
104 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
105
106 return desc;
107}
108
109struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
110{
111 /* those static or target node is -1, do not move them */
112 if (desc->irq < NR_IRQS_LEGACY || node == -1)
113 return desc;
114
115 if (desc->node != node)
116 desc = __real_move_irq_desc(desc, node);
117
118 return desc;
119}
120
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 09a2ee540bd2..01b1d3a88983 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -21,7 +21,7 @@ static struct proc_dir_entry *root_irq_dir;
21static int irq_affinity_proc_show(struct seq_file *m, void *v) 21static int irq_affinity_proc_show(struct seq_file *m, void *v)
22{ 22{
23 struct irq_desc *desc = irq_to_desc((long)m->private); 23 struct irq_desc *desc = irq_to_desc((long)m->private);
24 const struct cpumask *mask = desc->affinity; 24 const struct cpumask *mask = desc->irq_data.affinity;
25 25
26#ifdef CONFIG_GENERIC_PENDING_IRQ 26#ifdef CONFIG_GENERIC_PENDING_IRQ
27 if (desc->status & IRQ_MOVE_PENDING) 27 if (desc->status & IRQ_MOVE_PENDING)
@@ -65,7 +65,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
65 cpumask_var_t new_value; 65 cpumask_var_t new_value;
66 int err; 66 int err;
67 67
68 if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity || 68 if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity ||
69 irq_balancing_disabled(irq)) 69 irq_balancing_disabled(irq))
70 return -EIO; 70 return -EIO;
71 71
@@ -185,7 +185,7 @@ static int irq_node_proc_show(struct seq_file *m, void *v)
185{ 185{
186 struct irq_desc *desc = irq_to_desc((long) m->private); 186 struct irq_desc *desc = irq_to_desc((long) m->private);
187 187
188 seq_printf(m, "%d\n", desc->node); 188 seq_printf(m, "%d\n", desc->irq_data.node);
189 return 0; 189 return 0;
190} 190}
191 191
@@ -269,7 +269,7 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
269{ 269{
270 char name [MAX_NAMELEN]; 270 char name [MAX_NAMELEN];
271 271
272 if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir) 272 if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir)
273 return; 273 return;
274 274
275 memset(name, 0, MAX_NAMELEN); 275 memset(name, 0, MAX_NAMELEN);
@@ -297,6 +297,24 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
297 &irq_spurious_proc_fops, (void *)(long)irq); 297 &irq_spurious_proc_fops, (void *)(long)irq);
298} 298}
299 299
300void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
301{
302 char name [MAX_NAMELEN];
303
304 if (!root_irq_dir || !desc->dir)
305 return;
306#ifdef CONFIG_SMP
307 remove_proc_entry("smp_affinity", desc->dir);
308 remove_proc_entry("affinity_hint", desc->dir);
309 remove_proc_entry("node", desc->dir);
310#endif
311 remove_proc_entry("spurious", desc->dir);
312
313 memset(name, 0, MAX_NAMELEN);
314 sprintf(name, "%u", irq);
315 remove_proc_entry(name, root_irq_dir);
316}
317
300#undef MAX_NAMELEN 318#undef MAX_NAMELEN
301 319
302void unregister_handler_proc(unsigned int irq, struct irqaction *action) 320void unregister_handler_proc(unsigned int irq, struct irqaction *action)
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 090c3763f3a2..891115a929aa 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -60,7 +60,7 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
60 /* 60 /*
61 * Make sure the interrupt is enabled, before resending it: 61 * Make sure the interrupt is enabled, before resending it:
62 */ 62 */
63 desc->chip->enable(irq); 63 desc->irq_data.chip->irq_enable(&desc->irq_data);
64 64
65 /* 65 /*
66 * We do not resend level type interrupts. Level type 66 * We do not resend level type interrupts. Level type
@@ -70,7 +70,8 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { 70 if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY; 71 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
72 72
73 if (!desc->chip->retrigger || !desc->chip->retrigger(irq)) { 73 if (!desc->irq_data.chip->irq_retrigger ||
74 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
74#ifdef CONFIG_HARDIRQS_SW_RESEND 75#ifdef CONFIG_HARDIRQS_SW_RESEND
75 /* Set it pending and activate the softirq: */ 76 /* Set it pending and activate the softirq: */
76 set_bit(irq, irqs_resend); 77 set_bit(irq, irqs_resend);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 89fb90ae534f..3089d3b9d5f3 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -14,6 +14,8 @@
14#include <linux/moduleparam.h> 14#include <linux/moduleparam.h>
15#include <linux/timer.h> 15#include <linux/timer.h>
16 16
17#include "internals.h"
18
17static int irqfixup __read_mostly; 19static int irqfixup __read_mostly;
18 20
19#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) 21#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
@@ -78,8 +80,8 @@ static int try_one_irq(int irq, struct irq_desc *desc)
78 * If we did actual work for the real IRQ line we must let the 80 * If we did actual work for the real IRQ line we must let the
79 * IRQ controller clean up too 81 * IRQ controller clean up too
80 */ 82 */
81 if (work && desc->chip && desc->chip->end) 83 if (work)
82 desc->chip->end(irq); 84 irq_end(irq, desc);
83 raw_spin_unlock(&desc->lock); 85 raw_spin_unlock(&desc->lock);
84 86
85 return ok; 87 return ok;
@@ -254,7 +256,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
254 printk(KERN_EMERG "Disabling IRQ #%d\n", irq); 256 printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
255 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; 257 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED;
256 desc->depth++; 258 desc->depth++;
257 desc->chip->disable(irq); 259 desc->irq_data.chip->irq_disable(&desc->irq_data);
258 260
259 mod_timer(&poll_spurious_irq_timer, 261 mod_timer(&poll_spurious_irq_timer,
260 jiffies + POLL_SPURIOUS_IRQ_INTERVAL); 262 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index f2852a510232..42ba65dff7d9 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -639,6 +639,16 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
639 } 639 }
640#endif 640#endif
641 641
642 if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
643 debug_locks_off();
644 printk(KERN_ERR
645 "BUG: looking up invalid subclass: %u\n", subclass);
646 printk(KERN_ERR
647 "turning off the locking correctness validator.\n");
648 dump_stack();
649 return NULL;
650 }
651
642 /* 652 /*
643 * Static locks do not have their class-keys yet - for them the key 653 * Static locks do not have their class-keys yet - for them the key
644 * is the lock object itself: 654 * is the lock object itself:
@@ -774,7 +784,9 @@ out_unlock_set:
774 raw_local_irq_restore(flags); 784 raw_local_irq_restore(flags);
775 785
776 if (!subclass || force) 786 if (!subclass || force)
777 lock->class_cache = class; 787 lock->class_cache[0] = class;
788 else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
789 lock->class_cache[subclass] = class;
778 790
779 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) 791 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
780 return NULL; 792 return NULL;
@@ -2679,7 +2691,11 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2679void lockdep_init_map(struct lockdep_map *lock, const char *name, 2691void lockdep_init_map(struct lockdep_map *lock, const char *name,
2680 struct lock_class_key *key, int subclass) 2692 struct lock_class_key *key, int subclass)
2681{ 2693{
2682 lock->class_cache = NULL; 2694 int i;
2695
2696 for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
2697 lock->class_cache[i] = NULL;
2698
2683#ifdef CONFIG_LOCK_STAT 2699#ifdef CONFIG_LOCK_STAT
2684 lock->cpu = raw_smp_processor_id(); 2700 lock->cpu = raw_smp_processor_id();
2685#endif 2701#endif
@@ -2739,21 +2755,13 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2739 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2755 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2740 return 0; 2756 return 0;
2741 2757
2742 if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
2743 debug_locks_off();
2744 printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n");
2745 printk("turning off the locking correctness validator.\n");
2746 dump_stack();
2747 return 0;
2748 }
2749
2750 if (lock->key == &__lockdep_no_validate__) 2758 if (lock->key == &__lockdep_no_validate__)
2751 check = 1; 2759 check = 1;
2752 2760
2753 if (!subclass) 2761 if (subclass < NR_LOCKDEP_CACHING_CLASSES)
2754 class = lock->class_cache; 2762 class = lock->class_cache[subclass];
2755 /* 2763 /*
2756 * Not cached yet or subclass? 2764 * Not cached?
2757 */ 2765 */
2758 if (unlikely(!class)) { 2766 if (unlikely(!class)) {
2759 class = register_lock_class(lock, subclass, 0); 2767 class = register_lock_class(lock, subclass, 0);
@@ -2918,7 +2926,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
2918 return 1; 2926 return 1;
2919 2927
2920 if (hlock->references) { 2928 if (hlock->references) {
2921 struct lock_class *class = lock->class_cache; 2929 struct lock_class *class = lock->class_cache[0];
2922 2930
2923 if (!class) 2931 if (!class)
2924 class = look_up_lock_class(lock, 0); 2932 class = look_up_lock_class(lock, 0);
@@ -3559,7 +3567,12 @@ void lockdep_reset_lock(struct lockdep_map *lock)
3559 if (list_empty(head)) 3567 if (list_empty(head))
3560 continue; 3568 continue;
3561 list_for_each_entry_safe(class, next, head, hash_entry) { 3569 list_for_each_entry_safe(class, next, head, hash_entry) {
3562 if (unlikely(class == lock->class_cache)) { 3570 int match = 0;
3571
3572 for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
3573 match |= class == lock->class_cache[j];
3574
3575 if (unlikely(match)) {
3563 if (debug_locks_off_graph_unlock()) 3576 if (debug_locks_off_graph_unlock())
3564 WARN_ON(1); 3577 WARN_ON(1);
3565 goto out_restore; 3578 goto out_restore;
@@ -3775,7 +3788,7 @@ EXPORT_SYMBOL_GPL(debug_show_all_locks);
3775 * Careful: only use this function if you are sure that 3788 * Careful: only use this function if you are sure that
3776 * the task cannot run in parallel! 3789 * the task cannot run in parallel!
3777 */ 3790 */
3778void __debug_show_held_locks(struct task_struct *task) 3791void debug_show_held_locks(struct task_struct *task)
3779{ 3792{
3780 if (unlikely(!debug_locks)) { 3793 if (unlikely(!debug_locks)) {
3781 printk("INFO: lockdep is turned off.\n"); 3794 printk("INFO: lockdep is turned off.\n");
@@ -3783,12 +3796,6 @@ void __debug_show_held_locks(struct task_struct *task)
3783 } 3796 }
3784 lockdep_print_held_locks(task); 3797 lockdep_print_held_locks(task);
3785} 3798}
3786EXPORT_SYMBOL_GPL(__debug_show_held_locks);
3787
3788void debug_show_held_locks(struct task_struct *task)
3789{
3790 __debug_show_held_locks(task);
3791}
3792EXPORT_SYMBOL_GPL(debug_show_held_locks); 3799EXPORT_SYMBOL_GPL(debug_show_held_locks);
3793 3800
3794void lockdep_sys_exit(void) 3801void lockdep_sys_exit(void)
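The lockdep changes above turn the single class_cache into a per-subclass class_cache[NR_LOCKDEP_CACHING_CLASSES] array and move the subclass range check into look_up_lock_class(). Subclasses come from the *_nested() locking annotations; a minimal sketch of the kind of caller that exercises them (hypothetical objects, not from this patch):

#include <linux/spinlock.h>

/* Hypothetical: two locks of the same lock class taken in a fixed
 * parent->child order.  The nested annotation supplies the subclass
 * index that the per-subclass class_cache[] above now caches. */
struct example_obj {
	spinlock_t lock;	/* assume spin_lock_init() was done at creation */
};

static void example_lock_pair(struct example_obj *parent,
			      struct example_obj *child)
{
	spin_lock(&parent->lock);
	spin_lock_nested(&child->lock, SINGLE_DEPTH_NESTING);
	/* ... work on both objects ... */
	spin_unlock(&child->lock);
	spin_unlock(&parent->lock);
}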
diff --git a/kernel/pid.c b/kernel/pid.c
index d55c6fb8d087..39b65b69584f 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -401,7 +401,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
401 struct task_struct *result = NULL; 401 struct task_struct *result = NULL;
402 if (pid) { 402 if (pid) {
403 struct hlist_node *first; 403 struct hlist_node *first;
404 first = rcu_dereference_check(pid->tasks[type].first, 404 first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]),
405 rcu_read_lock_held() || 405 rcu_read_lock_held() ||
406 lockdep_tasklist_lock_is_held()); 406 lockdep_tasklist_lock_is_held());
407 if (first) 407 if (first)
@@ -416,6 +416,7 @@ EXPORT_SYMBOL(pid_task);
416 */ 416 */
417struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 417struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
418{ 418{
419 rcu_lockdep_assert(rcu_read_lock_held());
419 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); 420 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
420} 421}
421 422
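The pid.c hunk above adds rcu_lockdep_assert(rcu_read_lock_held()) to find_task_by_pid_ns(), so callers must be inside an RCU read-side critical section. A minimal sketch of a conforming caller (hypothetical wrapper, not from this patch):

#include <linux/rcupdate.h>
#include <linux/sched.h>

/* Hypothetical: look up a task and take a reference before leaving the
 * RCU read-side critical section; the new assertion fires (under
 * CONFIG_PROVE_RCU) if rcu_read_lock() is forgotten. */
static struct task_struct *example_find_task(pid_t nr, struct pid_namespace *ns)
{
	struct task_struct *t;

	rcu_read_lock();
	t = find_task_by_pid_ns(nr, ns);
	if (t)
		get_task_struct(t);
	rcu_read_unlock();
	return t;
}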
diff --git a/kernel/printk.c b/kernel/printk.c
index 8fe465ac008a..2531017795f6 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -85,7 +85,7 @@ EXPORT_SYMBOL(oops_in_progress);
85 * provides serialisation for access to the entire console 85 * provides serialisation for access to the entire console
86 * driver system. 86 * driver system.
87 */ 87 */
88static DECLARE_MUTEX(console_sem); 88static DEFINE_SEMAPHORE(console_sem);
89struct console *console_drivers; 89struct console *console_drivers;
90EXPORT_SYMBOL_GPL(console_drivers); 90EXPORT_SYMBOL_GPL(console_drivers);
91 91
@@ -556,7 +556,7 @@ static void zap_locks(void)
556 /* If a crash is occurring, make sure we can't deadlock */ 556 /* If a crash is occurring, make sure we can't deadlock */
557 spin_lock_init(&logbuf_lock); 557 spin_lock_init(&logbuf_lock);
558 /* And make sure that we print immediately */ 558 /* And make sure that we print immediately */
559 init_MUTEX(&console_sem); 559 sema_init(&console_sem, 1);
560} 560}
561 561
562#if defined(CONFIG_PRINTK_TIME) 562#if defined(CONFIG_PRINTK_TIME)
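The printk.c hunks above move console_sem off the deprecated DECLARE_MUTEX() / init_MUTEX() interfaces onto DEFINE_SEMAPHORE() and sema_init(). A minimal sketch of the replacement pattern (hypothetical semaphore, not from this patch):

#include <linux/semaphore.h>

static DEFINE_SEMAPHORE(example_sem);		/* was: DECLARE_MUTEX(example_sem) */

static void example_reinit(void)
{
	sema_init(&example_sem, 1);		/* was: init_MUTEX(&example_sem) */
}

static void example_serialized_work(void)
{
	down(&example_sem);
	/* ... section serialized by the semaphore ... */
	up(&example_sem);
}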
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 4d169835fb36..a23a57a976d1 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -73,12 +73,14 @@ int debug_lockdep_rcu_enabled(void)
73EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); 73EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
74 74
75/** 75/**
76 * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section? 76 * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
77 * 77 *
78 * Check for bottom half being disabled, which covers both the 78 * Check for bottom half being disabled, which covers both the
79 * CONFIG_PROVE_RCU and not cases. Note that if someone uses 79 * CONFIG_PROVE_RCU and not cases. Note that if someone uses
80 * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) 80 * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
81 * will show the situation. 81 * will show the situation. This is useful for debug checks in functions
82 * that require that they be called within an RCU read-side critical
83 * section.
82 * 84 *
83 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. 85 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
84 */ 86 */
@@ -86,7 +88,7 @@ int rcu_read_lock_bh_held(void)
86{ 88{
87 if (!debug_lockdep_rcu_enabled()) 89 if (!debug_lockdep_rcu_enabled())
88 return 1; 90 return 1;
89 return in_softirq(); 91 return in_softirq() || irqs_disabled();
90} 92}
91EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); 93EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
92 94
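The expanded rcu_read_lock_bh_held() kernel-doc above points at its intended use: debug checks in functions that must run with RCU-bh read-side protection. A minimal sketch of such a check (hypothetical functions, not from this patch):

#include <linux/kernel.h>
#include <linux/rcupdate.h>

/* Hypothetical: must be called with RCU-bh read-side protection held. */
static void example_requires_rcu_bh(void)
{
	/* Warns (with CONFIG_PROVE_RCU) if the caller forgot rcu_read_lock_bh(). */
	WARN_ON_ONCE(!rcu_read_lock_bh_held());
	/* ... access an RCU-bh protected structure here ... */
}

static void example_caller(void)
{
	rcu_read_lock_bh();
	example_requires_rcu_bh();
	rcu_read_unlock_bh();
}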
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 196ec02f8be0..d806735342ac 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -59,6 +59,14 @@ int rcu_scheduler_active __read_mostly;
59EXPORT_SYMBOL_GPL(rcu_scheduler_active); 59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
61 61
62/* Forward declarations for rcutiny_plugin.h. */
63static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
64static void __call_rcu(struct rcu_head *head,
65 void (*func)(struct rcu_head *rcu),
66 struct rcu_ctrlblk *rcp);
67
68#include "rcutiny_plugin.h"
69
62#ifdef CONFIG_NO_HZ 70#ifdef CONFIG_NO_HZ
63 71
64static long rcu_dynticks_nesting = 1; 72static long rcu_dynticks_nesting = 1;
@@ -140,6 +148,7 @@ void rcu_check_callbacks(int cpu, int user)
140 rcu_sched_qs(cpu); 148 rcu_sched_qs(cpu);
141 else if (!in_softirq()) 149 else if (!in_softirq())
142 rcu_bh_qs(cpu); 150 rcu_bh_qs(cpu);
151 rcu_preempt_check_callbacks();
143} 152}
144 153
145/* 154/*
@@ -162,6 +171,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
162 *rcp->donetail = NULL; 171 *rcp->donetail = NULL;
163 if (rcp->curtail == rcp->donetail) 172 if (rcp->curtail == rcp->donetail)
164 rcp->curtail = &rcp->rcucblist; 173 rcp->curtail = &rcp->rcucblist;
174 rcu_preempt_remove_callbacks(rcp);
165 rcp->donetail = &rcp->rcucblist; 175 rcp->donetail = &rcp->rcucblist;
166 local_irq_restore(flags); 176 local_irq_restore(flags);
167 177
@@ -182,6 +192,7 @@ static void rcu_process_callbacks(struct softirq_action *unused)
182{ 192{
183 __rcu_process_callbacks(&rcu_sched_ctrlblk); 193 __rcu_process_callbacks(&rcu_sched_ctrlblk);
184 __rcu_process_callbacks(&rcu_bh_ctrlblk); 194 __rcu_process_callbacks(&rcu_bh_ctrlblk);
195 rcu_preempt_process_callbacks();
185} 196}
186 197
187/* 198/*
@@ -223,15 +234,15 @@ static void __call_rcu(struct rcu_head *head,
223} 234}
224 235
225/* 236/*
226 * Post an RCU callback to be invoked after the end of an RCU grace 237 * Post an RCU callback to be invoked after the end of an RCU-sched grace
227 * period. But since we have but one CPU, that would be after any 238 * period. But since we have but one CPU, that would be after any
228 * quiescent state. 239 * quiescent state.
229 */ 240 */
230void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 241void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
231{ 242{
232 __call_rcu(head, func, &rcu_sched_ctrlblk); 243 __call_rcu(head, func, &rcu_sched_ctrlblk);
233} 244}
234EXPORT_SYMBOL_GPL(call_rcu); 245EXPORT_SYMBOL_GPL(call_rcu_sched);
235 246
236/* 247/*
237 * Post an RCU bottom-half callback to be invoked after any subsequent 248 * Post an RCU bottom-half callback to be invoked after any subsequent
@@ -243,20 +254,6 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
243} 254}
244EXPORT_SYMBOL_GPL(call_rcu_bh); 255EXPORT_SYMBOL_GPL(call_rcu_bh);
245 256
246void rcu_barrier(void)
247{
248 struct rcu_synchronize rcu;
249
250 init_rcu_head_on_stack(&rcu.head);
251 init_completion(&rcu.completion);
252 /* Will wake me after RCU finished. */
253 call_rcu(&rcu.head, wakeme_after_rcu);
254 /* Wait for it. */
255 wait_for_completion(&rcu.completion);
256 destroy_rcu_head_on_stack(&rcu.head);
257}
258EXPORT_SYMBOL_GPL(rcu_barrier);
259
260void rcu_barrier_bh(void) 257void rcu_barrier_bh(void)
261{ 258{
262 struct rcu_synchronize rcu; 259 struct rcu_synchronize rcu;
@@ -289,5 +286,3 @@ void __init rcu_init(void)
289{ 286{
290 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 287 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
291} 288}
292
293#include "rcutiny_plugin.h"
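With the hunks above, TINY_RCU's call_rcu() becomes call_rcu_sched(), matching the TREE_RCU naming, and the rcu_barrier() definition here is dropped (a preemptible variant appears in rcutiny_plugin.h below). A minimal sketch of posting an RCU-sched callback with the renamed API (hypothetical structure and callback, not from this patch):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_node {			/* hypothetical RCU-protected object */
	int value;
	struct rcu_head rcu;
};

static void example_free_rcu(struct rcu_head *head)
{
	/* Runs only after an RCU-sched grace period has elapsed. */
	kfree(container_of(head, struct example_node, rcu));
}

static void example_retire(struct example_node *node)
{
	/* Readers that found 'node' in preempt- or irq-disabled context are
	 * guaranteed to be done before example_free_rcu() is invoked. */
	call_rcu_sched(&node->rcu, example_free_rcu);
}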
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index d223a92bc742..6ceca4f745ff 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -1,7 +1,7 @@
1/* 1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version) 2 * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition
3 * Internal non-public definitions that provide either classic 3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics. 4 * or preemptible semantics.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by 7 * it under the terms of the GNU General Public License as published by
@@ -17,11 +17,587 @@
17 * along with this program; if not, write to the Free Software 17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 * 19 *
20 * Copyright IBM Corporation, 2009 20 * Copyright (c) 2010 Linaro
21 * 21 *
22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> 22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */ 23 */
24 24
25#ifdef CONFIG_TINY_PREEMPT_RCU
26
27#include <linux/delay.h>
28
29/* Global control variables for preemptible RCU. */
30struct rcu_preempt_ctrlblk {
31 struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */
32 struct rcu_head **nexttail;
33 /* Tasks blocked in a preemptible RCU */
34 /* read-side critical section while a */
35 /* preemptible-RCU grace period is in */
36 /* progress must wait for a later grace */
37 /* period. This pointer points to the */
38 /* ->next pointer of the last task that */
39 /* must wait for a later grace period, or */
40 /* to &->rcb.rcucblist if there is no */
41 /* such task. */
42 struct list_head blkd_tasks;
43 /* Tasks blocked in RCU read-side critical */
44 /* section. Tasks are placed at the head */
45 /* of this list and age towards the tail. */
46 struct list_head *gp_tasks;
47 /* Pointer to the first task blocking the */
48 /* current grace period, or NULL if there */
49 /* is no such task. */
50 struct list_head *exp_tasks;
51 /* Pointer to first task blocking the */
52 /* current expedited grace period, or NULL */
53 /* if there is no such task. If there */
54 /* is no current expedited grace period, */
55 /* then there cannot be any such task. */
56 u8 gpnum; /* Current grace period. */
57 u8 gpcpu; /* Last grace period blocked by the CPU. */
58 u8 completed; /* Last grace period completed. */
59 /* If all three are equal, RCU is idle. */
60};
61
62static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
63 .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist,
64 .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
65 .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
66 .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
67};
68
69static int rcu_preempted_readers_exp(void);
70static void rcu_report_exp_done(void);
71
72/*
73 * Return true if the CPU has not yet responded to the current grace period.
74 */
75static int rcu_cpu_blocking_cur_gp(void)
76{
77 return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum;
78}
79
80/*
81 * Check for a running RCU reader. Because there is only one CPU,
82 * there can be but one running RCU reader at a time. ;-)
83 */
84static int rcu_preempt_running_reader(void)
85{
86 return current->rcu_read_lock_nesting;
87}
88
89/*
90 * Check for preempted RCU readers blocking any grace period.
91 * If the caller needs a reliable answer, it must disable hard irqs.
92 */
93static int rcu_preempt_blocked_readers_any(void)
94{
95 return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks);
96}
97
98/*
99 * Check for preempted RCU readers blocking the current grace period.
100 * If the caller needs a reliable answer, it must disable hard irqs.
101 */
102static int rcu_preempt_blocked_readers_cgp(void)
103{
104 return rcu_preempt_ctrlblk.gp_tasks != NULL;
105}
106
107/*
108 * Return true if another preemptible-RCU grace period is needed.
109 */
110static int rcu_preempt_needs_another_gp(void)
111{
112 return *rcu_preempt_ctrlblk.rcb.curtail != NULL;
113}
114
115/*
116 * Return true if a preemptible-RCU grace period is in progress.
117 * The caller must disable hardirqs.
118 */
119static int rcu_preempt_gp_in_progress(void)
120{
121 return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum;
122}
123
124/*
125 * Record a preemptible-RCU quiescent state for the specified CPU. Note
126 * that this just means that the task currently running on the CPU is
127 * in a quiescent state. There might be any number of tasks blocked
128 * while in an RCU read-side critical section.
129 *
130 * Unlike the other rcu_*_qs() functions, callers to this function
131 * must disable irqs in order to protect the assignment to
132 * ->rcu_read_unlock_special.
133 *
134 * Because this is a single-CPU implementation, the only way a grace
135 * period can end is if the CPU is in a quiescent state. The reason is
136 * that a blocked preemptible-RCU reader can exit its critical section
137 * only if the CPU is running it at the time. Therefore, when the
138 * last task blocking the current grace period exits its RCU read-side
139 * critical section, neither the CPU nor blocked tasks will be stopping
140 * the current grace period. (In contrast, SMP implementations
141 * might have CPUs running in RCU read-side critical sections that
142 * block later grace periods -- but this is not possible given only
143 * one CPU.)
144 */
145static void rcu_preempt_cpu_qs(void)
146{
147 /* Record both CPU and task as having responded to current GP. */
148 rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum;
149 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
150
151 /*
152 * If there is no GP, or if blocked readers are still blocking GP,
153 * then there is nothing more to do.
154 */
155 if (!rcu_preempt_gp_in_progress() || rcu_preempt_blocked_readers_cgp())
156 return;
157
158 /* Advance callbacks. */
159 rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum;
160 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail;
161 rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail;
162
163 /* If there are no blocked readers, next GP is done instantly. */
164 if (!rcu_preempt_blocked_readers_any())
165 rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail;
166
167 /* If there are done callbacks, make RCU_SOFTIRQ process them. */
168 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
169 raise_softirq(RCU_SOFTIRQ);
170}
171
172/*
173 * Start a new RCU grace period if warranted. Hard irqs must be disabled.
174 */
175static void rcu_preempt_start_gp(void)
176{
177 if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) {
178
179 /* Official start of GP. */
180 rcu_preempt_ctrlblk.gpnum++;
181
182 /* Any blocked RCU readers block new GP. */
183 if (rcu_preempt_blocked_readers_any())
184 rcu_preempt_ctrlblk.gp_tasks =
185 rcu_preempt_ctrlblk.blkd_tasks.next;
186
187 /* If there is no running reader, CPU is done with GP. */
188 if (!rcu_preempt_running_reader())
189 rcu_preempt_cpu_qs();
190 }
191}
192
193/*
194 * We have entered the scheduler, and the current task might soon be
195 * context-switched away from. If this task is in an RCU read-side
196 * critical section, we will no longer be able to rely on the CPU to
197 * record that fact, so we enqueue the task on the blkd_tasks list.
198 * If the task started after the current grace period began, as recorded
199 * by ->gpcpu, we enqueue at the beginning of the list. Otherwise we
200 * enqueue it before the element referenced by ->gp_tasks (or at the tail if
201 * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element.
202 * The task will dequeue itself when it exits the outermost enclosing
203 * RCU read-side critical section. Therefore, the current grace period
204 * cannot be permitted to complete until the ->gp_tasks pointer becomes
205 * NULL.
206 *
207 * Caller must disable preemption.
208 */
209void rcu_preempt_note_context_switch(void)
210{
211 struct task_struct *t = current;
212 unsigned long flags;
213
214 local_irq_save(flags); /* must exclude scheduler_tick(). */
215 if (rcu_preempt_running_reader() &&
216 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
217
218 /* Possibly blocking in an RCU read-side critical section. */
219 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
220
221 /*
222 * If this CPU has already checked in, then this task
223 * will hold up the next grace period rather than the
224 * current grace period. Queue the task accordingly.
225 * If the task is queued for the current grace period
226 * (i.e., this CPU has not yet passed through a quiescent
227 * state for the current grace period), then as long
228 * as that task remains queued, the current grace period
229 * cannot end.
230 */
231 list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks);
232 if (rcu_cpu_blocking_cur_gp())
233 rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry;
234 }
235
236 /*
237 * Either we were not in an RCU read-side critical section to
238 * begin with, or we have now recorded that critical section
239 * globally. Either way, we can now note a quiescent state
240 * for this CPU. Again, if we were in an RCU read-side critical
241 * section, and if that critical section was blocking the current
242 * grace period, then the fact that the task has been enqueued
243 * means that current grace period continues to be blocked.
244 */
245 rcu_preempt_cpu_qs();
246 local_irq_restore(flags);
247}
248
249/*
250 * Tiny-preemptible RCU implementation for rcu_read_lock().
251 * Just increment ->rcu_read_lock_nesting, shared state will be updated
252 * if we block.
253 */
254void __rcu_read_lock(void)
255{
256 current->rcu_read_lock_nesting++;
257 barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */
258}
259EXPORT_SYMBOL_GPL(__rcu_read_lock);
260
261/*
262 * Handle special cases during rcu_read_unlock(), such as needing to
263 * notify RCU core processing or task having blocked during the RCU
264 * read-side critical section.
265 */
266static void rcu_read_unlock_special(struct task_struct *t)
267{
268 int empty;
269 int empty_exp;
270 unsigned long flags;
271 struct list_head *np;
272 int special;
273
274 /*
275 * NMI handlers cannot block and cannot safely manipulate state.
276 * They therefore cannot possibly be special, so just leave.
277 */
278 if (in_nmi())
279 return;
280
281 local_irq_save(flags);
282
283 /*
284 * If RCU core is waiting for this CPU to exit critical section,
285 * let it know that we have done so.
286 */
287 special = t->rcu_read_unlock_special;
288 if (special & RCU_READ_UNLOCK_NEED_QS)
289 rcu_preempt_cpu_qs();
290
291 /* Hardware IRQ handlers cannot block. */
292 if (in_irq()) {
293 local_irq_restore(flags);
294 return;
295 }
296
297 /* Clean up if blocked during RCU read-side critical section. */
298 if (special & RCU_READ_UNLOCK_BLOCKED) {
299 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED;
300
301 /*
302 * Remove this task from the ->blkd_tasks list and adjust
303 * any pointers that might have been referencing it.
304 */
305 empty = !rcu_preempt_blocked_readers_cgp();
306 empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
307 np = t->rcu_node_entry.next;
308 if (np == &rcu_preempt_ctrlblk.blkd_tasks)
309 np = NULL;
310 list_del(&t->rcu_node_entry);
311 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
312 rcu_preempt_ctrlblk.gp_tasks = np;
313 if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
314 rcu_preempt_ctrlblk.exp_tasks = np;
315 INIT_LIST_HEAD(&t->rcu_node_entry);
316
317 /*
318 * If this was the last task on the current list, and if
319 * we aren't waiting on the CPU, report the quiescent state
320 * and start a new grace period if needed.
321 */
322 if (!empty && !rcu_preempt_blocked_readers_cgp()) {
323 rcu_preempt_cpu_qs();
324 rcu_preempt_start_gp();
325 }
326
327 /*
328 * If this was the last task on the expedited lists,
329 * then we need to wake up the waiting task.
330 */
331 if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL)
332 rcu_report_exp_done();
333 }
334 local_irq_restore(flags);
335}
336
337/*
338 * Tiny-preemptible RCU implementation for rcu_read_unlock().
339 * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
340 * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
341 * invoke rcu_read_unlock_special() to clean up after a context switch
342 * in an RCU read-side critical section and other special cases.
343 */
344void __rcu_read_unlock(void)
345{
346 struct task_struct *t = current;
347
348 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */
349 --t->rcu_read_lock_nesting;
350 barrier(); /* decrement before load of ->rcu_read_unlock_special */
351 if (t->rcu_read_lock_nesting == 0 &&
352 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
353 rcu_read_unlock_special(t);
354#ifdef CONFIG_PROVE_LOCKING
355 WARN_ON_ONCE(t->rcu_read_lock_nesting < 0);
356#endif /* #ifdef CONFIG_PROVE_LOCKING */
357}
358EXPORT_SYMBOL_GPL(__rcu_read_unlock);
359
360/*
361 * Check for a quiescent state from the current CPU. When a task blocks,
362 * the task is recorded in the rcu_preempt_ctrlblk structure, which is
363 * checked elsewhere. This is called from the scheduling-clock interrupt.
364 *
365 * Caller must disable hard irqs.
366 */
367static void rcu_preempt_check_callbacks(void)
368{
369 struct task_struct *t = current;
370
371 if (rcu_preempt_gp_in_progress() &&
372 (!rcu_preempt_running_reader() ||
373 !rcu_cpu_blocking_cur_gp()))
374 rcu_preempt_cpu_qs();
375 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
376 rcu_preempt_ctrlblk.rcb.donetail)
377 raise_softirq(RCU_SOFTIRQ);
378 if (rcu_preempt_gp_in_progress() &&
379 rcu_cpu_blocking_cur_gp() &&
380 rcu_preempt_running_reader())
381 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS;
382}
383
384/*
385 * TINY_PREEMPT_RCU has an extra callback-list tail pointer to
386 * update, so this is invoked from __rcu_process_callbacks() to
387 * handle that case. Of course, it is invoked for all flavors of
388 * RCU, but RCU callbacks can appear only on one of the lists, and
389 * neither ->nexttail nor ->donetail can possibly be NULL, so there
390 * is no need for an explicit check.
391 */
392static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
393{
394 if (rcu_preempt_ctrlblk.nexttail == rcp->donetail)
395 rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist;
396}
397
398/*
399 * Process callbacks for preemptible RCU.
400 */
401static void rcu_preempt_process_callbacks(void)
402{
403 __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
404}
405
406/*
407 * Queue a preemptible-RCU callback for invocation after a grace period.
408 */
409void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
410{
411 unsigned long flags;
412
413 debug_rcu_head_queue(head);
414 head->func = func;
415 head->next = NULL;
416
417 local_irq_save(flags);
418 *rcu_preempt_ctrlblk.nexttail = head;
419 rcu_preempt_ctrlblk.nexttail = &head->next;
420 rcu_preempt_start_gp(); /* checks to see if GP needed. */
421 local_irq_restore(flags);
422}
423EXPORT_SYMBOL_GPL(call_rcu);
424
425void rcu_barrier(void)
426{
427 struct rcu_synchronize rcu;
428
429 init_rcu_head_on_stack(&rcu.head);
430 init_completion(&rcu.completion);
431 /* Will wake me after RCU finished. */
432 call_rcu(&rcu.head, wakeme_after_rcu);
433 /* Wait for it. */
434 wait_for_completion(&rcu.completion);
435 destroy_rcu_head_on_stack(&rcu.head);
436}
437EXPORT_SYMBOL_GPL(rcu_barrier);
438
439/*
440 * synchronize_rcu - wait until a grace period has elapsed.
441 *
442 * Control will return to the caller some time after a full grace
443 * period has elapsed, in other words after all currently executing RCU
444 * read-side critical sections have completed. RCU read-side critical
445 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
446 * and may be nested.
447 */
448void synchronize_rcu(void)
449{
450#ifdef CONFIG_DEBUG_LOCK_ALLOC
451 if (!rcu_scheduler_active)
452 return;
453#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
454
455 WARN_ON_ONCE(rcu_preempt_running_reader());
456 if (!rcu_preempt_blocked_readers_any())
457 return;
458
459 /* Once we get past the fastpath checks, same code as rcu_barrier(). */
460 rcu_barrier();
461}
462EXPORT_SYMBOL_GPL(synchronize_rcu);
463
464static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
465static unsigned long sync_rcu_preempt_exp_count;
466static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
467
468/*
469 * Return non-zero if there are any tasks in RCU read-side critical
470 * sections blocking the current preemptible-RCU expedited grace period.
471 * If there is no preemptible-RCU expedited grace period currently in
472 * progress, returns zero unconditionally.
473 */
474static int rcu_preempted_readers_exp(void)
475{
476 return rcu_preempt_ctrlblk.exp_tasks != NULL;
477}
478
479/*
480 * Report the exit from RCU read-side critical section for the last task
481 * that queued itself during or before the current expedited preemptible-RCU
482 * grace period.
483 */
484static void rcu_report_exp_done(void)
485{
486 wake_up(&sync_rcu_preempt_exp_wq);
487}
488
489/*
490 * Wait for an rcu-preempt grace period, but expedite it. The basic idea
491 * is to rely on the fact that there is but one CPU, and that it is
492 * illegal for a task to invoke synchronize_rcu_expedited() while in a
493 * preemptible-RCU read-side critical section. Therefore, any such
494 * critical sections must correspond to blocked tasks, which must therefore
495 * be on the ->blkd_tasks list. So just record the current head of the
496 * list in the ->exp_tasks pointer, and wait for all tasks including and
497 * after the task pointed to by ->exp_tasks to drain.
498 */
499void synchronize_rcu_expedited(void)
500{
501 unsigned long flags;
502 struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk;
503 unsigned long snap;
504
505 barrier(); /* ensure prior action seen before grace period. */
506
507 WARN_ON_ONCE(rcu_preempt_running_reader());
508
509 /*
510 * Acquire lock so that there is only one preemptible RCU grace
511 * period in flight. Of course, if someone does the expedited
512 * grace period for us while we are acquiring the lock, just leave.
513 */
514 snap = sync_rcu_preempt_exp_count + 1;
515 mutex_lock(&sync_rcu_preempt_exp_mutex);
516 if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count))
517 goto unlock_mb_ret; /* Others did our work for us. */
518
519 local_irq_save(flags);
520
521 /*
522 * All RCU readers have to already be on blkd_tasks because
523 * we cannot legally be executing in an RCU read-side critical
524 * section.
525 */
526
527 /* Snapshot current head of ->blkd_tasks list. */
528 rpcp->exp_tasks = rpcp->blkd_tasks.next;
529 if (rpcp->exp_tasks == &rpcp->blkd_tasks)
530 rpcp->exp_tasks = NULL;
531 local_irq_restore(flags);
532
533 /* Wait for tail of ->blkd_tasks list to drain. */
534 if (rcu_preempted_readers_exp())
535 wait_event(sync_rcu_preempt_exp_wq,
536 !rcu_preempted_readers_exp());
537
538 /* Clean up and exit. */
539 barrier(); /* ensure expedited GP seen before counter increment. */
540 sync_rcu_preempt_exp_count++;
541unlock_mb_ret:
542 mutex_unlock(&sync_rcu_preempt_exp_mutex);
543 barrier(); /* ensure subsequent action seen after grace period. */
544}
545EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
546
547/*
548 * Does preemptible RCU need the CPU to stay out of dynticks mode?
549 */
550int rcu_preempt_needs_cpu(void)
551{
552 if (!rcu_preempt_running_reader())
553 rcu_preempt_cpu_qs();
554 return rcu_preempt_ctrlblk.rcb.rcucblist != NULL;
555}
556
557/*
558 * Check for a task exiting while in a preemptible-RCU read-side
559 * critical section, clean up if so. No need to issue warnings,
560 * as debug_check_no_locks_held() already does this if lockdep
561 * is enabled.
562 */
563void exit_rcu(void)
564{
565 struct task_struct *t = current;
566
567 if (t->rcu_read_lock_nesting == 0)
568 return;
569 t->rcu_read_lock_nesting = 1;
570 rcu_read_unlock();
571}
572
573#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
574
575/*
576 * Because preemptible RCU does not exist, it never has any callbacks
577 * to check.
578 */
579static void rcu_preempt_check_callbacks(void)
580{
581}
582
583/*
584 * Because preemptible RCU does not exist, it never has any callbacks
585 * to remove.
586 */
587static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
588{
589}
590
591/*
592 * Because preemptible RCU does not exist, it never has any callbacks
593 * to process.
594 */
595static void rcu_preempt_process_callbacks(void)
596{
597}
598
599#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
600
25#ifdef CONFIG_DEBUG_LOCK_ALLOC 601#ifdef CONFIG_DEBUG_LOCK_ALLOC
26 602
27#include <linux/kernel_stat.h> 603#include <linux/kernel_stat.h>
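The new TINY_PREEMPT_RCU code above supplies the preemptible read-side primitives (__rcu_read_lock()/__rcu_read_unlock() behind rcu_read_lock()/rcu_read_unlock()) plus call_rcu(), synchronize_rcu() and synchronize_rcu_expedited(). A minimal reader/updater sketch exercising that API (hypothetical configuration object, not from this patch):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_cfg {			/* hypothetical RCU-protected config */
	int threshold;
	struct rcu_head rcu;
};

static struct example_cfg __rcu *example_cfg_ptr;

static int example_read_threshold(void)
{
	struct example_cfg *cfg;
	int ret = 0;

	rcu_read_lock();		/* may be preempted under TINY_PREEMPT_RCU */
	cfg = rcu_dereference(example_cfg_ptr);
	if (cfg)
		ret = cfg->threshold;
	rcu_read_unlock();
	return ret;
}

static void example_free_cfg(struct rcu_head *head)
{
	kfree(container_of(head, struct example_cfg, rcu));
}

/* Updater: publish new_cfg and retire old_cfg after a grace period
 * (call_rcu() defers the free; synchronize_rcu() + kfree() would be the
 * blocking alternative).  Assumes the caller serializes updaters. */
static void example_replace_cfg(struct example_cfg *old_cfg,
				struct example_cfg *new_cfg)
{
	rcu_assign_pointer(example_cfg_ptr, new_cfg);
	if (old_cfg)
		call_rcu(&old_cfg->rcu, example_free_cfg);
}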
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 2e2726d790b9..9d8e8fb2515f 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -120,7 +120,7 @@ struct rcu_torture {
120}; 120};
121 121
122static LIST_HEAD(rcu_torture_freelist); 122static LIST_HEAD(rcu_torture_freelist);
123static struct rcu_torture *rcu_torture_current; 123static struct rcu_torture __rcu *rcu_torture_current;
124static long rcu_torture_current_version; 124static long rcu_torture_current_version;
125static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; 125static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
126static DEFINE_SPINLOCK(rcu_torture_lock); 126static DEFINE_SPINLOCK(rcu_torture_lock);
@@ -153,8 +153,10 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
153#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ 153#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */
154#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ 154#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */
155static int fullstop = FULLSTOP_RMMOD; 155static int fullstop = FULLSTOP_RMMOD;
156DEFINE_MUTEX(fullstop_mutex); /* Protect fullstop transitions and spawning */ 156/*
157 /* of kthreads. */ 157 * Protect fullstop transitions and spawning of kthreads.
158 */
159static DEFINE_MUTEX(fullstop_mutex);
158 160
159/* 161/*
160 * Detect and respond to a system shutdown. 162 * Detect and respond to a system shutdown.
@@ -303,6 +305,10 @@ static void rcu_read_delay(struct rcu_random_state *rrsp)
303 mdelay(longdelay_ms); 305 mdelay(longdelay_ms);
304 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) 306 if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us)))
305 udelay(shortdelay_us); 307 udelay(shortdelay_us);
308#ifdef CONFIG_PREEMPT
309 if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000)))
310 preempt_schedule(); /* No QS if preempt_disable() in effect */
311#endif
306} 312}
307 313
308static void rcu_torture_read_unlock(int idx) __releases(RCU) 314static void rcu_torture_read_unlock(int idx) __releases(RCU)
@@ -536,6 +542,8 @@ static void srcu_read_delay(struct rcu_random_state *rrsp)
536 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); 542 delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick);
537 if (!delay) 543 if (!delay)
538 schedule_timeout_interruptible(longdelay); 544 schedule_timeout_interruptible(longdelay);
545 else
546 rcu_read_delay(rrsp);
539} 547}
540 548
541static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) 549static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl)
@@ -731,7 +739,8 @@ rcu_torture_writer(void *arg)
731 continue; 739 continue;
732 rp->rtort_pipe_count = 0; 740 rp->rtort_pipe_count = 0;
733 udelay(rcu_random(&rand) & 0x3ff); 741 udelay(rcu_random(&rand) & 0x3ff);
734 old_rp = rcu_torture_current; 742 old_rp = rcu_dereference_check(rcu_torture_current,
743 current == writer_task);
735 rp->rtort_mbtest = 1; 744 rp->rtort_mbtest = 1;
736 rcu_assign_pointer(rcu_torture_current, rp); 745 rcu_assign_pointer(rcu_torture_current, rp);
737 smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ 746 smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */
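The rcutorture hunks above annotate rcu_torture_current with __rcu and read it through rcu_dereference_check() with an update-side condition (current == writer_task). The same sparse/lockdep annotation pattern in a generic form (hypothetical names; a mutex stands in for the writer-task condition):

#include <linux/mutex.h>
#include <linux/rcupdate.h>

struct example_item {
	int id;
};

static DEFINE_MUTEX(example_lock);
static struct example_item __rcu *example_current;	/* hypothetical pointer */

/* Legal from an RCU reader, or from the updater holding example_lock;
 * anything else trips CONFIG_PROVE_RCU. */
static struct example_item *example_get_current(void)
{
	return rcu_dereference_check(example_current,
				     lockdep_is_held(&example_lock));
}

static void example_publish(struct example_item *item)
{
	mutex_lock(&example_lock);
	rcu_assign_pointer(example_current, item);	/* orders init before publish */
	mutex_unlock(&example_lock);
}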
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d5bc43976c5a..ccdc04c47981 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -143,6 +143,11 @@ module_param(blimit, int, 0);
143module_param(qhimark, int, 0); 143module_param(qhimark, int, 0);
144module_param(qlowmark, int, 0); 144module_param(qlowmark, int, 0);
145 145
146#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
147int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT;
148module_param(rcu_cpu_stall_suppress, int, 0644);
149#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
150
146static void force_quiescent_state(struct rcu_state *rsp, int relaxed); 151static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
147static int rcu_pending(int cpu); 152static int rcu_pending(int cpu);
148 153
@@ -450,7 +455,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
450 455
451#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 456#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
452 457
453int rcu_cpu_stall_panicking __read_mostly; 458int rcu_cpu_stall_suppress __read_mostly;
454 459
455static void record_gp_stall_check_time(struct rcu_state *rsp) 460static void record_gp_stall_check_time(struct rcu_state *rsp)
456{ 461{
@@ -482,8 +487,11 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
482 rcu_print_task_stall(rnp); 487 rcu_print_task_stall(rnp);
483 raw_spin_unlock_irqrestore(&rnp->lock, flags); 488 raw_spin_unlock_irqrestore(&rnp->lock, flags);
484 489
485 /* OK, time to rat on our buddy... */ 490 /*
486 491 * OK, time to rat on our buddy...
492 * See Documentation/RCU/stallwarn.txt for info on how to debug
493 * RCU CPU stall warnings.
494 */
487 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", 495 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
488 rsp->name); 496 rsp->name);
489 rcu_for_each_leaf_node(rsp, rnp) { 497 rcu_for_each_leaf_node(rsp, rnp) {
@@ -512,6 +520,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
512 unsigned long flags; 520 unsigned long flags;
513 struct rcu_node *rnp = rcu_get_root(rsp); 521 struct rcu_node *rnp = rcu_get_root(rsp);
514 522
523 /*
524 * OK, time to rat on ourselves...
525 * See Documentation/RCU/stallwarn.txt for info on how to debug
526 * RCU CPU stall warnings.
527 */
515 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", 528 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
516 rsp->name, smp_processor_id(), jiffies - rsp->gp_start); 529 rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
517 trigger_all_cpu_backtrace(); 530 trigger_all_cpu_backtrace();
@@ -530,11 +543,11 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
530 long delta; 543 long delta;
531 struct rcu_node *rnp; 544 struct rcu_node *rnp;
532 545
533 if (rcu_cpu_stall_panicking) 546 if (rcu_cpu_stall_suppress)
534 return; 547 return;
535 delta = jiffies - rsp->jiffies_stall; 548 delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
536 rnp = rdp->mynode; 549 rnp = rdp->mynode;
537 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { 550 if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) {
538 551
539 /* We haven't checked in, so go dump stack. */ 552 /* We haven't checked in, so go dump stack. */
540 print_cpu_stall(rsp); 553 print_cpu_stall(rsp);
@@ -548,10 +561,26 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
548 561
549static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) 562static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
550{ 563{
551 rcu_cpu_stall_panicking = 1; 564 rcu_cpu_stall_suppress = 1;
552 return NOTIFY_DONE; 565 return NOTIFY_DONE;
553} 566}
554 567
568/**
569 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
570 *
571 * Set the stall-warning timeout way off into the future, thus preventing
572 * any RCU CPU stall-warning messages from appearing in the current set of
573 * RCU grace periods.
574 *
575 * The caller must disable hard irqs.
576 */
577void rcu_cpu_stall_reset(void)
578{
579 rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2;
580 rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2;
581 rcu_preempt_stall_reset();
582}
583
555static struct notifier_block rcu_panic_block = { 584static struct notifier_block rcu_panic_block = {
556 .notifier_call = rcu_panic, 585 .notifier_call = rcu_panic,
557}; 586};
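The new rcu_cpu_stall_reset() above is aimed at callers that are about to suspend normal scheduling for a long but legitimate time (a kernel debugger, say) and do not want stall warnings for the current grace periods; per its kernel-doc it must run with hard irqs disabled. A minimal sketch of such a caller (hypothetical function, assuming the declaration is visible through the RCU headers; not from this patch):

#include <linux/irqflags.h>
#include <linux/rcupdate.h>

/* Hypothetical: called just before parking all CPUs for a long pause. */
static void example_prepare_long_pause(void)
{
	unsigned long flags;

	local_irq_save(flags);
	rcu_cpu_stall_reset();	/* push the stall-warning deadline far out */
	local_irq_restore(flags);
}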
@@ -571,6 +600,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
571{ 600{
572} 601}
573 602
603void rcu_cpu_stall_reset(void)
604{
605}
606
574static void __init check_cpu_stall_init(void) 607static void __init check_cpu_stall_init(void)
575{ 608{
576} 609}
@@ -712,7 +745,7 @@ static void
712rcu_start_gp(struct rcu_state *rsp, unsigned long flags) 745rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
713 __releases(rcu_get_root(rsp)->lock) 746 __releases(rcu_get_root(rsp)->lock)
714{ 747{
715 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 748 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
716 struct rcu_node *rnp = rcu_get_root(rsp); 749 struct rcu_node *rnp = rcu_get_root(rsp);
717 750
718 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { 751 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
@@ -960,7 +993,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
960static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) 993static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
961{ 994{
962 int i; 995 int i;
963 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 996 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
964 997
965 if (rdp->nxtlist == NULL) 998 if (rdp->nxtlist == NULL)
966 return; /* irqs disabled, so comparison is stable. */ 999 return; /* irqs disabled, so comparison is stable. */
@@ -971,6 +1004,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
971 for (i = 0; i < RCU_NEXT_SIZE; i++) 1004 for (i = 0; i < RCU_NEXT_SIZE; i++)
972 rdp->nxttail[i] = &rdp->nxtlist; 1005 rdp->nxttail[i] = &rdp->nxtlist;
973 rsp->orphan_qlen += rdp->qlen; 1006 rsp->orphan_qlen += rdp->qlen;
1007 rdp->n_cbs_orphaned += rdp->qlen;
974 rdp->qlen = 0; 1008 rdp->qlen = 0;
975 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1009 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
976} 1010}
@@ -984,7 +1018,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
984 struct rcu_data *rdp; 1018 struct rcu_data *rdp;
985 1019
986 raw_spin_lock_irqsave(&rsp->onofflock, flags); 1020 raw_spin_lock_irqsave(&rsp->onofflock, flags);
987 rdp = rsp->rda[smp_processor_id()]; 1021 rdp = this_cpu_ptr(rsp->rda);
988 if (rsp->orphan_cbs_list == NULL) { 1022 if (rsp->orphan_cbs_list == NULL) {
989 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 1023 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
990 return; 1024 return;
@@ -992,6 +1026,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
992 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; 1026 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
993 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; 1027 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
994 rdp->qlen += rsp->orphan_qlen; 1028 rdp->qlen += rsp->orphan_qlen;
1029 rdp->n_cbs_adopted += rsp->orphan_qlen;
995 rsp->orphan_cbs_list = NULL; 1030 rsp->orphan_cbs_list = NULL;
996 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; 1031 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
997 rsp->orphan_qlen = 0; 1032 rsp->orphan_qlen = 0;
@@ -1007,7 +1042,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1007 unsigned long flags; 1042 unsigned long flags;
1008 unsigned long mask; 1043 unsigned long mask;
1009 int need_report = 0; 1044 int need_report = 0;
1010 struct rcu_data *rdp = rsp->rda[cpu]; 1045 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1011 struct rcu_node *rnp; 1046 struct rcu_node *rnp;
1012 1047
1013 /* Exclude any attempts to start a new grace period. */ 1048 /* Exclude any attempts to start a new grace period. */
@@ -1123,6 +1158,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1123 1158
1124 /* Update count, and requeue any remaining callbacks. */ 1159 /* Update count, and requeue any remaining callbacks. */
1125 rdp->qlen -= count; 1160 rdp->qlen -= count;
1161 rdp->n_cbs_invoked += count;
1126 if (list != NULL) { 1162 if (list != NULL) {
1127 *tail = rdp->nxtlist; 1163 *tail = rdp->nxtlist;
1128 rdp->nxtlist = list; 1164 rdp->nxtlist = list;
@@ -1226,7 +1262,8 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1226 cpu = rnp->grplo; 1262 cpu = rnp->grplo;
1227 bit = 1; 1263 bit = 1;
1228 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { 1264 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
1229 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) 1265 if ((rnp->qsmask & bit) != 0 &&
1266 f(per_cpu_ptr(rsp->rda, cpu)))
1230 mask |= bit; 1267 mask |= bit;
1231 } 1268 }
1232 if (mask != 0) { 1269 if (mask != 0) {
@@ -1402,7 +1439,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1402 * a quiescent state betweentimes. 1439 * a quiescent state betweentimes.
1403 */ 1440 */
1404 local_irq_save(flags); 1441 local_irq_save(flags);
1405 rdp = rsp->rda[smp_processor_id()]; 1442 rdp = this_cpu_ptr(rsp->rda);
1406 rcu_process_gp_end(rsp, rdp); 1443 rcu_process_gp_end(rsp, rdp);
1407 check_for_new_grace_period(rsp, rdp); 1444 check_for_new_grace_period(rsp, rdp);
1408 1445
@@ -1701,7 +1738,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1701{ 1738{
1702 unsigned long flags; 1739 unsigned long flags;
1703 int i; 1740 int i;
1704 struct rcu_data *rdp = rsp->rda[cpu]; 1741 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1705 struct rcu_node *rnp = rcu_get_root(rsp); 1742 struct rcu_node *rnp = rcu_get_root(rsp);
1706 1743
1707 /* Set up local state, ensuring consistent view of global state. */ 1744 /* Set up local state, ensuring consistent view of global state. */
@@ -1729,7 +1766,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1729{ 1766{
1730 unsigned long flags; 1767 unsigned long flags;
1731 unsigned long mask; 1768 unsigned long mask;
1732 struct rcu_data *rdp = rsp->rda[cpu]; 1769 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1733 struct rcu_node *rnp = rcu_get_root(rsp); 1770 struct rcu_node *rnp = rcu_get_root(rsp);
1734 1771
1735 /* Set up local state, ensuring consistent view of global state. */ 1772 /* Set up local state, ensuring consistent view of global state. */
@@ -1865,7 +1902,8 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
1865/* 1902/*
1866 * Helper function for rcu_init() that initializes one rcu_state structure. 1903 * Helper function for rcu_init() that initializes one rcu_state structure.
1867 */ 1904 */
1868static void __init rcu_init_one(struct rcu_state *rsp) 1905static void __init rcu_init_one(struct rcu_state *rsp,
1906 struct rcu_data __percpu *rda)
1869{ 1907{
1870 static char *buf[] = { "rcu_node_level_0", 1908 static char *buf[] = { "rcu_node_level_0",
1871 "rcu_node_level_1", 1909 "rcu_node_level_1",
@@ -1918,37 +1956,23 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1918 } 1956 }
1919 } 1957 }
1920 1958
1959 rsp->rda = rda;
1921 rnp = rsp->level[NUM_RCU_LVLS - 1]; 1960 rnp = rsp->level[NUM_RCU_LVLS - 1];
1922 for_each_possible_cpu(i) { 1961 for_each_possible_cpu(i) {
1923 while (i > rnp->grphi) 1962 while (i > rnp->grphi)
1924 rnp++; 1963 rnp++;
1925 rsp->rda[i]->mynode = rnp; 1964 per_cpu_ptr(rsp->rda, i)->mynode = rnp;
1926 rcu_boot_init_percpu_data(i, rsp); 1965 rcu_boot_init_percpu_data(i, rsp);
1927 } 1966 }
1928} 1967}
1929 1968
1930/*
1931 * Helper macro for __rcu_init() and __rcu_init_preempt(). To be used
1932 * nowhere else! Assigns leaf node pointers into each CPU's rcu_data
1933 * structure.
1934 */
1935#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1936do { \
1937 int i; \
1938 \
1939 for_each_possible_cpu(i) { \
1940 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1941 } \
1942 rcu_init_one(rsp); \
1943} while (0)
1944
1945void __init rcu_init(void) 1969void __init rcu_init(void)
1946{ 1970{
1947 int cpu; 1971 int cpu;
1948 1972
1949 rcu_bootup_announce(); 1973 rcu_bootup_announce();
1950 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); 1974 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
1951 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); 1975 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
1952 __rcu_init_preempt(); 1976 __rcu_init_preempt();
1953 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 1977 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1954 1978
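The remaining rcutree.c hunks above convert rsp->rda from an NR_CPUS-sized array of pointers into a single __percpu pointer, dropping the RCU_INIT_FLAVOR() macro and passing the per-CPU data into rcu_init_one(); accesses go through this_cpu_ptr() and per_cpu_ptr(). The same conversion pattern on a generic structure (hypothetical names, not from this patch):

#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/preempt.h>

struct example_pcpu {			/* hypothetical per-CPU state */
	unsigned long count;
};

static DEFINE_PER_CPU(struct example_pcpu, example_pcpu_data);

struct example_state {
	/* was: struct example_pcpu *pcpu[NR_CPUS]; */
	struct example_pcpu __percpu *pcpu;
};

static struct example_state example_state;

static void __init example_init(void)
{
	example_state.pcpu = &example_pcpu_data;	/* mirrors "rsp->rda = rda" */
}

static void example_touch_this_cpu(void)
{
	preempt_disable();
	/* was: example_state.pcpu[smp_processor_id()]->count++; */
	this_cpu_ptr(example_state.pcpu)->count++;
	preempt_enable();
}

static unsigned long example_read_cpu(int cpu)
{
	/* was: example_state.pcpu[cpu]->count; */
	return per_cpu_ptr(example_state.pcpu, cpu)->count;
}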
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 14c040b18ed0..91d4170c5c13 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -202,6 +202,9 @@ struct rcu_data {
202 long qlen; /* # of queued callbacks */ 202 long qlen; /* # of queued callbacks */
203 long qlen_last_fqs_check; 203 long qlen_last_fqs_check;
204 /* qlen at last check for QS forcing */ 204 /* qlen at last check for QS forcing */
205 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
206 unsigned long n_cbs_orphaned; /* RCU cbs sent to orphanage. */
207 unsigned long n_cbs_adopted; /* RCU cbs adopted from orphanage. */
205 unsigned long n_force_qs_snap; 208 unsigned long n_force_qs_snap;
206 /* did other CPU force QS recently? */ 209 /* did other CPU force QS recently? */
207 long blimit; /* Upper limit on a processed batch */ 210 long blimit; /* Upper limit on a processed batch */
@@ -254,19 +257,23 @@ struct rcu_data {
254#define RCU_STALL_DELAY_DELTA 0 257#define RCU_STALL_DELAY_DELTA 0
255#endif 258#endif
256 259
257#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA) 260#define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \
261 RCU_STALL_DELAY_DELTA)
258 /* for rsp->jiffies_stall */ 262 /* for rsp->jiffies_stall */
259#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA) 263#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30)
260 /* for rsp->jiffies_stall */ 264 /* for rsp->jiffies_stall */
261#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ 265#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */
262 /* to take at least one */ 266 /* to take at least one */
263 /* scheduling clock irq */ 267 /* scheduling clock irq */
264 /* before ratting on them. */ 268 /* before ratting on them. */
265 269
266#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 270#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE
271#define RCU_CPU_STALL_SUPPRESS_INIT 0
272#else
273#define RCU_CPU_STALL_SUPPRESS_INIT 1
274#endif
267 275
268#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) 276#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
269#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
270 277
271/* 278/*
272 * RCU global state, including node hierarchy. This hierarchy is 279 * RCU global state, including node hierarchy. This hierarchy is
@@ -283,7 +290,7 @@ struct rcu_state {
283 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ 290 struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */
284 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ 291 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */
285 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ 292 u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */
286 struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */ 293 struct rcu_data __percpu *rda; /* pointer to percpu rcu_data. */
287 294
288 /* The following fields are guarded by the root rcu_node's lock. */ 295 /* The following fields are guarded by the root rcu_node's lock. */
289 296
@@ -365,6 +372,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
365#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 372#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
366static void rcu_print_detail_task_stall(struct rcu_state *rsp); 373static void rcu_print_detail_task_stall(struct rcu_state *rsp);
367static void rcu_print_task_stall(struct rcu_node *rnp); 374static void rcu_print_task_stall(struct rcu_node *rnp);
375static void rcu_preempt_stall_reset(void);
368#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 376#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
369static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 377static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
370#ifdef CONFIG_HOTPLUG_CPU 378#ifdef CONFIG_HOTPLUG_CPU
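For reference, the reworked stall-warning macros above make the initial check interval configurable and derive the recheck interval from it. With, say, CONFIG_RCU_CPU_STALL_TIMEOUT=60 and HZ=1000 (hypothetical values, and taking RCU_STALL_DELAY_DELTA as 0 per the branch shown above), they expand to:

	RCU_SECONDS_TILL_STALL_CHECK   = 60 * 1000 + 0  = 60000 jiffies  (60 s)
	RCU_SECONDS_TILL_STALL_RECHECK = 3 * 60000 + 30 = 180030 jiffies (~180 s)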
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 0e4f420245d9..71a4147473f9 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -57,7 +57,7 @@ static void __init rcu_bootup_announce_oddness(void)
57 printk(KERN_INFO 57 printk(KERN_INFO
58 "\tRCU-based detection of stalled CPUs is disabled.\n"); 58 "\tRCU-based detection of stalled CPUs is disabled.\n");
59#endif 59#endif
60#ifndef CONFIG_RCU_CPU_STALL_VERBOSE 60#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
61 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); 61 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
62#endif 62#endif
63#if NUM_RCU_LVL_4 != 0 63#if NUM_RCU_LVL_4 != 0
@@ -154,7 +154,7 @@ static void rcu_preempt_note_context_switch(int cpu)
154 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 154 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) {
155 155
156 /* Possibly blocking in an RCU read-side critical section. */ 156 /* Possibly blocking in an RCU read-side critical section. */
157 rdp = rcu_preempt_state.rda[cpu]; 157 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
158 rnp = rdp->mynode; 158 rnp = rdp->mynode;
159 raw_spin_lock_irqsave(&rnp->lock, flags); 159 raw_spin_lock_irqsave(&rnp->lock, flags);
160 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 160 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
@@ -201,7 +201,7 @@ static void rcu_preempt_note_context_switch(int cpu)
201 */ 201 */
202void __rcu_read_lock(void) 202void __rcu_read_lock(void)
203{ 203{
204 ACCESS_ONCE(current->rcu_read_lock_nesting)++; 204 current->rcu_read_lock_nesting++;
205 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ 205 barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */
206} 206}
207EXPORT_SYMBOL_GPL(__rcu_read_lock); 207EXPORT_SYMBOL_GPL(__rcu_read_lock);
@@ -344,7 +344,9 @@ void __rcu_read_unlock(void)
344 struct task_struct *t = current; 344 struct task_struct *t = current;
345 345
346 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ 346 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
347 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && 347 --t->rcu_read_lock_nesting;
348 barrier(); /* decrement before load of ->rcu_read_unlock_special */
349 if (t->rcu_read_lock_nesting == 0 &&
348 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 350 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
349 rcu_read_unlock_special(t); 351 rcu_read_unlock_special(t);
350#ifdef CONFIG_PROVE_LOCKING 352#ifdef CONFIG_PROVE_LOCKING
@@ -417,6 +419,16 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
417 } 419 }
418} 420}
419 421
422/*
423 * Suppress preemptible RCU's CPU stall warnings by pushing the
424 * time of the next stall-warning message comfortably far into the
425 * future.
426 */
427static void rcu_preempt_stall_reset(void)
428{
429 rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
430}
431
420#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 432#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
421 433
422/* 434/*
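A brief note on the ULONG_MAX / 2 offset used above, reasoning only; the comparison macros are the ones quoted from rcutree.h earlier in this patch:

/*
 * ULONG_CMP_GE(a, b) := (ULONG_MAX / 2 >= (a) - (b))
 * ULONG_CMP_LT(a, b) := (ULONG_MAX / 2 <  (a) - (b))
 *
 * With ->jiffies_stall set to jiffies + ULONG_MAX / 2, any later
 * wrap-safe "has the stall deadline passed?" check stays false for the
 * longest interval the half-range comparison can represent, which is
 * what "suppress the warning" means here.
 */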
@@ -546,9 +558,11 @@ EXPORT_SYMBOL_GPL(call_rcu);
546 * 558 *
547 * Control will return to the caller some time after a full grace 559 * Control will return to the caller some time after a full grace
548 * period has elapsed, in other words after all currently executing RCU 560 * period has elapsed, in other words after all currently executing RCU
549 * read-side critical sections have completed. RCU read-side critical 561 * read-side critical sections have completed. Note, however, that
550 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), 562 * upon return from synchronize_rcu(), the caller might well be executing
551 * and may be nested. 563 * concurrently with new RCU read-side critical sections that began while
564 * synchronize_rcu() was waiting. RCU read-side critical sections are
565 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
552 */ 566 */
553void synchronize_rcu(void) 567void synchronize_rcu(void)
554{ 568{
@@ -771,7 +785,7 @@ static void rcu_preempt_send_cbs_to_orphanage(void)
771 */ 785 */
772static void __init __rcu_init_preempt(void) 786static void __init __rcu_init_preempt(void)
773{ 787{
774 RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data); 788 rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
775} 789}
776 790
777/* 791/*
@@ -865,6 +879,14 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
865{ 879{
866} 880}
867 881
882/*
883 * Because preemptible RCU does not exist, there is no need to suppress
884 * its CPU stall warnings.
885 */
886static void rcu_preempt_stall_reset(void)
887{
888}
889
868#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 890#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
869 891
870/* 892/*
@@ -919,15 +941,6 @@ static void rcu_preempt_process_callbacks(void)
919} 941}
920 942
921/* 943/*
922 * In classic RCU, call_rcu() is just call_rcu_sched().
923 */
924void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
925{
926 call_rcu_sched(head, func);
927}
928EXPORT_SYMBOL_GPL(call_rcu);
929
930/*
931 * Wait for an rcu-preempt grace period, but make it happen quickly. 944 * Wait for an rcu-preempt grace period, but make it happen quickly.
932 * But because preemptable RCU does not exist, map to rcu-sched. 945 * But because preemptable RCU does not exist, map to rcu-sched.
933 */ 946 */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 36c95b45738e..d15430b9d122 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -64,7 +64,9 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
64 rdp->dynticks_fqs); 64 rdp->dynticks_fqs);
65#endif /* #ifdef CONFIG_NO_HZ */ 65#endif /* #ifdef CONFIG_NO_HZ */
66 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 66 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
67 seq_printf(m, " ql=%ld b=%ld\n", rdp->qlen, rdp->blimit); 67 seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit);
68 seq_printf(m, " ci=%lu co=%lu ca=%lu\n",
69 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
68} 70}
69 71
70#define PRINT_RCU_DATA(name, func, m) \ 72#define PRINT_RCU_DATA(name, func, m) \
@@ -119,7 +121,9 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
119 rdp->dynticks_fqs); 121 rdp->dynticks_fqs);
120#endif /* #ifdef CONFIG_NO_HZ */ 122#endif /* #ifdef CONFIG_NO_HZ */
121 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 123 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
122 seq_printf(m, ",%ld,%ld\n", rdp->qlen, rdp->blimit); 124 seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit);
125 seq_printf(m, ",%lu,%lu,%lu\n",
126 rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
123} 127}
124 128
125static int show_rcudata_csv(struct seq_file *m, void *unused) 129static int show_rcudata_csv(struct seq_file *m, void *unused)
@@ -128,7 +132,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
128#ifdef CONFIG_NO_HZ 132#ifdef CONFIG_NO_HZ
129 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\","); 133 seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
130#endif /* #ifdef CONFIG_NO_HZ */ 134#endif /* #ifdef CONFIG_NO_HZ */
131 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\"\n"); 135 seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n");
132#ifdef CONFIG_TREE_PREEMPT_RCU 136#ifdef CONFIG_TREE_PREEMPT_RCU
133 seq_puts(m, "\"rcu_preempt:\"\n"); 137 seq_puts(m, "\"rcu_preempt:\"\n");
134 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); 138 PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m);
@@ -262,7 +266,7 @@ static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
262 struct rcu_data *rdp; 266 struct rcu_data *rdp;
263 267
264 for_each_possible_cpu(cpu) { 268 for_each_possible_cpu(cpu) {
265 rdp = rsp->rda[cpu]; 269 rdp = per_cpu_ptr(rsp->rda, cpu);
266 if (rdp->beenonline) 270 if (rdp->beenonline)
267 print_one_rcu_pending(m, rdp); 271 print_one_rcu_pending(m, rdp);
268 } 272 }
diff --git a/kernel/sched.c b/kernel/sched.c
index c0d2067f3e0d..d42992bccdfa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -426,9 +426,7 @@ struct root_domain {
426 */ 426 */
427 cpumask_var_t rto_mask; 427 cpumask_var_t rto_mask;
428 atomic_t rto_count; 428 atomic_t rto_count;
429#ifdef CONFIG_SMP
430 struct cpupri cpupri; 429 struct cpupri cpupri;
431#endif
432}; 430};
433 431
434/* 432/*
@@ -437,7 +435,7 @@ struct root_domain {
437 */ 435 */
438static struct root_domain def_root_domain; 436static struct root_domain def_root_domain;
439 437
440#endif 438#endif /* CONFIG_SMP */
441 439
442/* 440/*
443 * This is the main, per-CPU runqueue data structure. 441 * This is the main, per-CPU runqueue data structure.
@@ -488,11 +486,12 @@ struct rq {
488 */ 486 */
489 unsigned long nr_uninterruptible; 487 unsigned long nr_uninterruptible;
490 488
491 struct task_struct *curr, *idle; 489 struct task_struct *curr, *idle, *stop;
492 unsigned long next_balance; 490 unsigned long next_balance;
493 struct mm_struct *prev_mm; 491 struct mm_struct *prev_mm;
494 492
495 u64 clock; 493 u64 clock;
494 u64 clock_task;
496 495
497 atomic_t nr_iowait; 496 atomic_t nr_iowait;
498 497
@@ -520,6 +519,10 @@ struct rq {
520 u64 avg_idle; 519 u64 avg_idle;
521#endif 520#endif
522 521
522#ifdef CONFIG_IRQ_TIME_ACCOUNTING
523 u64 prev_irq_time;
524#endif
525
523 /* calc_load related fields */ 526 /* calc_load related fields */
524 unsigned long calc_load_update; 527 unsigned long calc_load_update;
525 long calc_load_active; 528 long calc_load_active;
@@ -643,10 +646,22 @@ static inline struct task_group *task_group(struct task_struct *p)
643 646
644#endif /* CONFIG_CGROUP_SCHED */ 647#endif /* CONFIG_CGROUP_SCHED */
645 648
649static u64 irq_time_cpu(int cpu);
650static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
651
646inline void update_rq_clock(struct rq *rq) 652inline void update_rq_clock(struct rq *rq)
647{ 653{
648 if (!rq->skip_clock_update) 654 if (!rq->skip_clock_update) {
649 rq->clock = sched_clock_cpu(cpu_of(rq)); 655 int cpu = cpu_of(rq);
656 u64 irq_time;
657
658 rq->clock = sched_clock_cpu(cpu);
659 irq_time = irq_time_cpu(cpu);
660 if (rq->clock - irq_time > rq->clock_task)
661 rq->clock_task = rq->clock - irq_time;
662
663 sched_irq_time_avg_update(rq, irq_time);
664 }
650} 665}
651 666
652/* 667/*
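A hedged numeric illustration of the clock_task update above (figures invented for the example): rq->clock keeps tracking raw sched_clock time, while rq->clock_task only ratchets forward by the portion not spent in interrupts.

/*
 * rq->clock            = 1000 us   total time observed on this CPU
 * irq_time_cpu(cpu)    =  300 us   cumulative hard+soft irq time (if enabled)
 * rq->clock - irq_time =  700 us   > rq->clock_task, so clock_task advances to 700 us
 *
 * update_curr(), update_curr_rt() and do_task_delta_exec() later in this
 * diff switch their deltas to clock_task, so irq processing no longer
 * inflates a task's measured runtime.
 */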
@@ -723,7 +738,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
723 size_t cnt, loff_t *ppos) 738 size_t cnt, loff_t *ppos)
724{ 739{
725 char buf[64]; 740 char buf[64];
726 char *cmp = buf; 741 char *cmp;
727 int neg = 0; 742 int neg = 0;
728 int i; 743 int i;
729 744
@@ -734,6 +749,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
734 return -EFAULT; 749 return -EFAULT;
735 750
736 buf[cnt] = 0; 751 buf[cnt] = 0;
752 cmp = strstrip(buf);
737 753
738 if (strncmp(buf, "NO_", 3) == 0) { 754 if (strncmp(buf, "NO_", 3) == 0) {
739 neg = 1; 755 neg = 1;
@@ -741,9 +757,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
741 } 757 }
742 758
743 for (i = 0; sched_feat_names[i]; i++) { 759 for (i = 0; sched_feat_names[i]; i++) {
744 int len = strlen(sched_feat_names[i]); 760 if (strcmp(cmp, sched_feat_names[i]) == 0) {
745
746 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
747 if (neg) 761 if (neg)
748 sysctl_sched_features &= ~(1UL << i); 762 sysctl_sched_features &= ~(1UL << i);
749 else 763 else
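A short illustration of why the exact-match comparison above is the safer choice (the feature name NONIRQ_POWER_FOO is hypothetical):

/*
 * Old check:  strncmp(cmp, "NONIRQ_POWER", strlen("NONIRQ_POWER")) == 0
 *             also matches a write of "NONIRQ_POWER_FOO\n", i.e. any
 *             string that merely starts with a feature name.
 * New check:  strstrip(buf) drops the trailing newline that
 *             "echo FEAT > sched_features" appends, and strcmp() then
 *             requires a whole-name match against sched_feat_names[i].
 */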
@@ -1840,7 +1854,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1840 1854
1841static const struct sched_class rt_sched_class; 1855static const struct sched_class rt_sched_class;
1842 1856
1843#define sched_class_highest (&rt_sched_class) 1857#define sched_class_highest (&stop_sched_class)
1844#define for_each_class(class) \ 1858#define for_each_class(class) \
1845 for (class = sched_class_highest; class; class = class->next) 1859 for (class = sched_class_highest; class; class = class->next)
1846 1860
@@ -1858,12 +1872,6 @@ static void dec_nr_running(struct rq *rq)
1858 1872
1859static void set_load_weight(struct task_struct *p) 1873static void set_load_weight(struct task_struct *p)
1860{ 1874{
1861 if (task_has_rt_policy(p)) {
1862 p->se.load.weight = 0;
1863 p->se.load.inv_weight = WMULT_CONST;
1864 return;
1865 }
1866
1867 /* 1875 /*
1868 * SCHED_IDLE tasks get minimal weight: 1876 * SCHED_IDLE tasks get minimal weight:
1869 */ 1877 */
@@ -1917,13 +1925,132 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1917 dec_nr_running(rq); 1925 dec_nr_running(rq);
1918} 1926}
1919 1927
1928#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1929
1930/*
1931 * There are no locks covering percpu hardirq/softirq time.
1932 * They are only modified in account_system_vtime, on corresponding CPU
1933 * with interrupts disabled. So, writes are safe.
1934 * They are read and saved off onto struct rq in update_rq_clock().
1935 * This may result in other CPU reading this CPU's irq time and can
1936 * race with irq/account_system_vtime on this CPU. We would either get old
1937 * or new value (or semi updated value on 32 bit) with a side effect of
1938 * accounting a slice of irq time to wrong task when irq is in progress
1939 * while we read rq->clock. That is a worthy compromise in place of having
1940 * locks on each irq in account_system_time.
1941 */
1942static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1943static DEFINE_PER_CPU(u64, cpu_softirq_time);
1944
1945static DEFINE_PER_CPU(u64, irq_start_time);
1946static int sched_clock_irqtime;
1947
1948void enable_sched_clock_irqtime(void)
1949{
1950 sched_clock_irqtime = 1;
1951}
1952
1953void disable_sched_clock_irqtime(void)
1954{
1955 sched_clock_irqtime = 0;
1956}
1957
1958static u64 irq_time_cpu(int cpu)
1959{
1960 if (!sched_clock_irqtime)
1961 return 0;
1962
1963 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1964}
1965
1966void account_system_vtime(struct task_struct *curr)
1967{
1968 unsigned long flags;
1969 int cpu;
1970 u64 now, delta;
1971
1972 if (!sched_clock_irqtime)
1973 return;
1974
1975 local_irq_save(flags);
1976
1977 cpu = smp_processor_id();
1978 now = sched_clock_cpu(cpu);
1979 delta = now - per_cpu(irq_start_time, cpu);
1980 per_cpu(irq_start_time, cpu) = now;
1981 /*
1982 * We do not account for softirq time from ksoftirqd here.
1983 * We want to continue accounting softirq time to the ksoftirqd thread
1984 * in that case, so as not to confuse the scheduler with a special task
1985 * that does not consume any time, but still wants to run.
1986 */
1987 if (hardirq_count())
1988 per_cpu(cpu_hardirq_time, cpu) += delta;
1989 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
1990 per_cpu(cpu_softirq_time, cpu) += delta;
1991
1992 local_irq_restore(flags);
1993}
1994EXPORT_SYMBOL_GPL(account_system_vtime);
1995
1996static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
1997{
1998 if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
1999 u64 delta_irq = curr_irq_time - rq->prev_irq_time;
2000 rq->prev_irq_time = curr_irq_time;
2001 sched_rt_avg_update(rq, delta_irq);
2002 }
2003}
2004
2005#else
2006
2007static u64 irq_time_cpu(int cpu)
2008{
2009 return 0;
2010}
2011
2012static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
2013
2014#endif
2015
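A hedged sketch of how the CONFIG_IRQ_TIME_ACCOUNTING machinery above is driven; the call sites are paraphrased from this series, not reproduced verbatim:

/*
 * __irq_enter() / irq_exit()      -> account_system_vtime(current)
 * __do_softirq() entry and exit   -> account_system_vtime(current)
 *                                    (visible in the softirq.c hunk below)
 *
 * Each call charges the sched_clock_cpu() time since the previous call on
 * this CPU to cpu_hardirq_time or cpu_softirq_time, depending on
 * hardirq_count() and in_serving_softirq().  An architecture opts in by
 * calling enable_sched_clock_irqtime() once its sched_clock() is
 * trustworthy; in this series x86 does so when the TSC is usable.
 */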
1920#include "sched_idletask.c" 2016#include "sched_idletask.c"
1921#include "sched_fair.c" 2017#include "sched_fair.c"
1922#include "sched_rt.c" 2018#include "sched_rt.c"
2019#include "sched_stoptask.c"
1923#ifdef CONFIG_SCHED_DEBUG 2020#ifdef CONFIG_SCHED_DEBUG
1924# include "sched_debug.c" 2021# include "sched_debug.c"
1925#endif 2022#endif
1926 2023
2024void sched_set_stop_task(int cpu, struct task_struct *stop)
2025{
2026 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
2027 struct task_struct *old_stop = cpu_rq(cpu)->stop;
2028
2029 if (stop) {
2030 /*
2031 * Make it appear like a SCHED_FIFO task, it's something
2032 * userspace knows about and won't get confused about.
2033 *
2034 * Also, it will make PI more or less work without too
2035 * much confusion -- but then, stop work should not
2036 * rely on PI working anyway.
2037 */
2038 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
2039
2040 stop->sched_class = &stop_sched_class;
2041 }
2042
2043 cpu_rq(cpu)->stop = stop;
2044
2045 if (old_stop) {
2046 /*
2047 * Reset it back to a normal scheduling class so that
2048 * it can die in pieces.
2049 */
2050 old_stop->sched_class = &rt_sched_class;
2051 }
2052}
2053
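A hedged usage sketch for sched_set_stop_task(), paraphrasing the cpu_stop hotplug callback in kernel/stop_machine.c from this series (error handling and reference counting omitted):

/*
 * CPU_UP_PREPARE:
 *     p = kthread_create(cpu_stopper_thread, stopper, "migration/%d", cpu);
 *     kthread_bind(p, cpu);
 *     sched_set_stop_task(cpu, p);     // run the stopper in stop_sched_class
 *
 * CPU_UP_CANCELED / CPU_POST_DEAD:
 *     sched_set_stop_task(cpu, NULL);  // drop it back to rt_sched_class
 *     kthread_stop(p);
 */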
1927/* 2054/*
1928 * __normal_prio - return the priority that is based on the static prio 2055 * __normal_prio - return the priority that is based on the static prio
1929 */ 2056 */
@@ -2003,6 +2130,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2003 if (p->sched_class != &fair_sched_class) 2130 if (p->sched_class != &fair_sched_class)
2004 return 0; 2131 return 0;
2005 2132
2133 if (unlikely(p->policy == SCHED_IDLE))
2134 return 0;
2135
2006 /* 2136 /*
2007 * Buddy candidates are cache hot: 2137 * Buddy candidates are cache hot:
2008 */ 2138 */
@@ -2852,14 +2982,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
2852 */ 2982 */
2853 arch_start_context_switch(prev); 2983 arch_start_context_switch(prev);
2854 2984
2855 if (likely(!mm)) { 2985 if (!mm) {
2856 next->active_mm = oldmm; 2986 next->active_mm = oldmm;
2857 atomic_inc(&oldmm->mm_count); 2987 atomic_inc(&oldmm->mm_count);
2858 enter_lazy_tlb(oldmm, next); 2988 enter_lazy_tlb(oldmm, next);
2859 } else 2989 } else
2860 switch_mm(oldmm, mm, next); 2990 switch_mm(oldmm, mm, next);
2861 2991
2862 if (likely(!prev->mm)) { 2992 if (!prev->mm) {
2863 prev->active_mm = NULL; 2993 prev->active_mm = NULL;
2864 rq->prev_mm = oldmm; 2994 rq->prev_mm = oldmm;
2865 } 2995 }
@@ -3248,7 +3378,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3248 3378
3249 if (task_current(rq, p)) { 3379 if (task_current(rq, p)) {
3250 update_rq_clock(rq); 3380 update_rq_clock(rq);
3251 ns = rq->clock - p->se.exec_start; 3381 ns = rq->clock_task - p->se.exec_start;
3252 if ((s64)ns < 0) 3382 if ((s64)ns < 0)
3253 ns = 0; 3383 ns = 0;
3254 } 3384 }
@@ -3397,7 +3527,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3397 tmp = cputime_to_cputime64(cputime); 3527 tmp = cputime_to_cputime64(cputime);
3398 if (hardirq_count() - hardirq_offset) 3528 if (hardirq_count() - hardirq_offset)
3399 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3529 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3400 else if (softirq_count()) 3530 else if (in_serving_softirq())
3401 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3531 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3402 else 3532 else
3403 cpustat->system = cputime64_add(cpustat->system, tmp); 3533 cpustat->system = cputime64_add(cpustat->system, tmp);
@@ -3723,17 +3853,13 @@ pick_next_task(struct rq *rq)
3723 return p; 3853 return p;
3724 } 3854 }
3725 3855
3726 class = sched_class_highest; 3856 for_each_class(class) {
3727 for ( ; ; ) {
3728 p = class->pick_next_task(rq); 3857 p = class->pick_next_task(rq);
3729 if (p) 3858 if (p)
3730 return p; 3859 return p;
3731 /*
3732 * Will never be NULL as the idle class always
3733 * returns a non-NULL p:
3734 */
3735 class = class->next;
3736 } 3860 }
3861
3862 BUG(); /* the idle class will always have a runnable task */
3737} 3863}
3738 3864
3739/* 3865/*
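For reference, the class chain that for_each_class() now walks; the .next assignments come from the respective sched_*.c files, with stop_sched_class added by the new sched_stoptask.c below:

/*
 * sched_class_highest = &stop_sched_class
 *     stop_sched_class.next = &rt_sched_class
 *     rt_sched_class.next   = &fair_sched_class
 *     fair_sched_class.next = &idle_sched_class
 *     idle_sched_class.next = NULL
 *
 * The idle class always returns a runnable task, so falling out of the
 * loop is impossible; hence the BUG() added after it.
 */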
@@ -4358,6 +4484,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4358 4484
4359 rq = task_rq_lock(p, &flags); 4485 rq = task_rq_lock(p, &flags);
4360 4486
4487 trace_sched_pi_setprio(p, prio);
4361 oldprio = p->prio; 4488 oldprio = p->prio;
4362 prev_class = p->sched_class; 4489 prev_class = p->sched_class;
4363 on_rq = p->se.on_rq; 4490 on_rq = p->se.on_rq;
@@ -4645,7 +4772,7 @@ recheck:
4645 } 4772 }
4646 4773
4647 if (user) { 4774 if (user) {
4648 retval = security_task_setscheduler(p, policy, param); 4775 retval = security_task_setscheduler(p);
4649 if (retval) 4776 if (retval)
4650 return retval; 4777 return retval;
4651 } 4778 }
@@ -4661,6 +4788,15 @@ recheck:
4661 */ 4788 */
4662 rq = __task_rq_lock(p); 4789 rq = __task_rq_lock(p);
4663 4790
4791 /*
4792 * Changing the policy of the stop threads is a very bad idea
4793 */
4794 if (p == rq->stop) {
4795 __task_rq_unlock(rq);
4796 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4797 return -EINVAL;
4798 }
4799
4664#ifdef CONFIG_RT_GROUP_SCHED 4800#ifdef CONFIG_RT_GROUP_SCHED
4665 if (user) { 4801 if (user) {
4666 /* 4802 /*
@@ -4887,13 +5023,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4887 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 5023 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
4888 goto out_unlock; 5024 goto out_unlock;
4889 5025
4890 retval = security_task_setscheduler(p, 0, NULL); 5026 retval = security_task_setscheduler(p);
4891 if (retval) 5027 if (retval)
4892 goto out_unlock; 5028 goto out_unlock;
4893 5029
4894 cpuset_cpus_allowed(p, cpus_allowed); 5030 cpuset_cpus_allowed(p, cpus_allowed);
4895 cpumask_and(new_mask, in_mask, cpus_allowed); 5031 cpumask_and(new_mask, in_mask, cpus_allowed);
4896 again: 5032again:
4897 retval = set_cpus_allowed_ptr(p, new_mask); 5033 retval = set_cpus_allowed_ptr(p, new_mask);
4898 5034
4899 if (!retval) { 5035 if (!retval) {
@@ -5337,7 +5473,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5337 idle->se.exec_start = sched_clock(); 5473 idle->se.exec_start = sched_clock();
5338 5474
5339 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 5475 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
5476 /*
5477 * We're having a chicken and egg problem: even though we are
5478 * holding rq->lock, the cpu isn't yet set to this cpu so the
5479 * lockdep check in task_group() will fail.
5480 *
5481 * Similar case to sched_fork(). / Alternatively we could
5482 * use task_rq_lock() here and obtain the other rq->lock.
5483 *
5484 * Silence PROVE_RCU
5485 */
5486 rcu_read_lock();
5340 __set_task_cpu(idle, cpu); 5487 __set_task_cpu(idle, cpu);
5488 rcu_read_unlock();
5341 5489
5342 rq->curr = rq->idle = idle; 5490 rq->curr = rq->idle = idle;
5343#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5491#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
@@ -6514,6 +6662,7 @@ struct s_data {
6514 cpumask_var_t nodemask; 6662 cpumask_var_t nodemask;
6515 cpumask_var_t this_sibling_map; 6663 cpumask_var_t this_sibling_map;
6516 cpumask_var_t this_core_map; 6664 cpumask_var_t this_core_map;
6665 cpumask_var_t this_book_map;
6517 cpumask_var_t send_covered; 6666 cpumask_var_t send_covered;
6518 cpumask_var_t tmpmask; 6667 cpumask_var_t tmpmask;
6519 struct sched_group **sched_group_nodes; 6668 struct sched_group **sched_group_nodes;
@@ -6525,6 +6674,7 @@ enum s_alloc {
6525 sa_rootdomain, 6674 sa_rootdomain,
6526 sa_tmpmask, 6675 sa_tmpmask,
6527 sa_send_covered, 6676 sa_send_covered,
6677 sa_this_book_map,
6528 sa_this_core_map, 6678 sa_this_core_map,
6529 sa_this_sibling_map, 6679 sa_this_sibling_map,
6530 sa_nodemask, 6680 sa_nodemask,
@@ -6560,31 +6710,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
6560#ifdef CONFIG_SCHED_MC 6710#ifdef CONFIG_SCHED_MC
6561static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 6711static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
6562static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); 6712static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6563#endif /* CONFIG_SCHED_MC */
6564 6713
6565#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6566static int 6714static int
6567cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6715cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
6568 struct sched_group **sg, struct cpumask *mask) 6716 struct sched_group **sg, struct cpumask *mask)
6569{ 6717{
6570 int group; 6718 int group;
6571 6719#ifdef CONFIG_SCHED_SMT
6572 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 6720 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6573 group = cpumask_first(mask); 6721 group = cpumask_first(mask);
6722#else
6723 group = cpu;
6724#endif
6574 if (sg) 6725 if (sg)
6575 *sg = &per_cpu(sched_group_core, group).sg; 6726 *sg = &per_cpu(sched_group_core, group).sg;
6576 return group; 6727 return group;
6577} 6728}
6578#elif defined(CONFIG_SCHED_MC) 6729#endif /* CONFIG_SCHED_MC */
6730
6731/*
6732 * book sched-domains:
6733 */
6734#ifdef CONFIG_SCHED_BOOK
6735static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
6736static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
6737
6579static int 6738static int
6580cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6739cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
6581 struct sched_group **sg, struct cpumask *unused) 6740 struct sched_group **sg, struct cpumask *mask)
6582{ 6741{
6742 int group = cpu;
6743#ifdef CONFIG_SCHED_MC
6744 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6745 group = cpumask_first(mask);
6746#elif defined(CONFIG_SCHED_SMT)
6747 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6748 group = cpumask_first(mask);
6749#endif
6583 if (sg) 6750 if (sg)
6584 *sg = &per_cpu(sched_group_core, cpu).sg; 6751 *sg = &per_cpu(sched_group_book, group).sg;
6585 return cpu; 6752 return group;
6586} 6753}
6587#endif 6754#endif /* CONFIG_SCHED_BOOK */
6588 6755
6589static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 6756static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
6590static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 6757static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
@@ -6594,7 +6761,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
6594 struct sched_group **sg, struct cpumask *mask) 6761 struct sched_group **sg, struct cpumask *mask)
6595{ 6762{
6596 int group; 6763 int group;
6597#ifdef CONFIG_SCHED_MC 6764#ifdef CONFIG_SCHED_BOOK
6765 cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
6766 group = cpumask_first(mask);
6767#elif defined(CONFIG_SCHED_MC)
6598 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 6768 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6599 group = cpumask_first(mask); 6769 group = cpumask_first(mask);
6600#elif defined(CONFIG_SCHED_SMT) 6770#elif defined(CONFIG_SCHED_SMT)
@@ -6855,6 +7025,9 @@ SD_INIT_FUNC(CPU)
6855#ifdef CONFIG_SCHED_MC 7025#ifdef CONFIG_SCHED_MC
6856 SD_INIT_FUNC(MC) 7026 SD_INIT_FUNC(MC)
6857#endif 7027#endif
7028#ifdef CONFIG_SCHED_BOOK
7029 SD_INIT_FUNC(BOOK)
7030#endif
6858 7031
6859static int default_relax_domain_level = -1; 7032static int default_relax_domain_level = -1;
6860 7033
@@ -6904,6 +7077,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6904 free_cpumask_var(d->tmpmask); /* fall through */ 7077 free_cpumask_var(d->tmpmask); /* fall through */
6905 case sa_send_covered: 7078 case sa_send_covered:
6906 free_cpumask_var(d->send_covered); /* fall through */ 7079 free_cpumask_var(d->send_covered); /* fall through */
7080 case sa_this_book_map:
7081 free_cpumask_var(d->this_book_map); /* fall through */
6907 case sa_this_core_map: 7082 case sa_this_core_map:
6908 free_cpumask_var(d->this_core_map); /* fall through */ 7083 free_cpumask_var(d->this_core_map); /* fall through */
6909 case sa_this_sibling_map: 7084 case sa_this_sibling_map:
@@ -6950,8 +7125,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
6950 return sa_nodemask; 7125 return sa_nodemask;
6951 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) 7126 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
6952 return sa_this_sibling_map; 7127 return sa_this_sibling_map;
6953 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) 7128 if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
6954 return sa_this_core_map; 7129 return sa_this_core_map;
7130 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7131 return sa_this_book_map;
6955 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) 7132 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
6956 return sa_send_covered; 7133 return sa_send_covered;
6957 d->rd = alloc_rootdomain(); 7134 d->rd = alloc_rootdomain();
@@ -7009,6 +7186,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
7009 return sd; 7186 return sd;
7010} 7187}
7011 7188
7189static struct sched_domain *__build_book_sched_domain(struct s_data *d,
7190 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7191 struct sched_domain *parent, int i)
7192{
7193 struct sched_domain *sd = parent;
7194#ifdef CONFIG_SCHED_BOOK
7195 sd = &per_cpu(book_domains, i).sd;
7196 SD_INIT(sd, BOOK);
7197 set_domain_attribute(sd, attr);
7198 cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
7199 sd->parent = parent;
7200 parent->child = sd;
7201 cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
7202#endif
7203 return sd;
7204}
7205
7012static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7206static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
7013 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7207 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7014 struct sched_domain *parent, int i) 7208 struct sched_domain *parent, int i)
@@ -7066,6 +7260,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
7066 d->send_covered, d->tmpmask); 7260 d->send_covered, d->tmpmask);
7067 break; 7261 break;
7068#endif 7262#endif
7263#ifdef CONFIG_SCHED_BOOK
7264 case SD_LV_BOOK: /* set up book groups */
7265 cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
7266 if (cpu == cpumask_first(d->this_book_map))
7267 init_sched_build_groups(d->this_book_map, cpu_map,
7268 &cpu_to_book_group,
7269 d->send_covered, d->tmpmask);
7270 break;
7271#endif
7069 case SD_LV_CPU: /* set up physical groups */ 7272 case SD_LV_CPU: /* set up physical groups */
7070 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); 7273 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
7071 if (!cpumask_empty(d->nodemask)) 7274 if (!cpumask_empty(d->nodemask))
@@ -7113,12 +7316,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7113 7316
7114 sd = __build_numa_sched_domains(&d, cpu_map, attr, i); 7317 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
7115 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); 7318 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
7319 sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
7116 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); 7320 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
7117 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); 7321 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
7118 } 7322 }
7119 7323
7120 for_each_cpu(i, cpu_map) { 7324 for_each_cpu(i, cpu_map) {
7121 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); 7325 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
7326 build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
7122 build_sched_groups(&d, SD_LV_MC, cpu_map, i); 7327 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
7123 } 7328 }
7124 7329
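Taken together, the calls above build the domain hierarchy bottom-up; on a machine with every level configured it ends up as follows (a summary of this hunk, not new behaviour):

/*
 * SD_LV_SIBLING (SMT threads, topology_thread_cpumask())
 *   -> SD_LV_MC   (cores sharing a cache, cpu_coregroup_mask())
 *     -> SD_LV_BOOK (CONFIG_SCHED_BOOK, cpu_book_mask(); added for s390 books)
 *       -> SD_LV_CPU  (physical package / node level)
 *         -> NUMA domains
 *
 * Without CONFIG_SCHED_BOOK, __build_book_sched_domain() simply returns
 * the parent domain unchanged, so other architectures are unaffected.
 */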
@@ -7149,6 +7354,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7149 init_sched_groups_power(i, sd); 7354 init_sched_groups_power(i, sd);
7150 } 7355 }
7151#endif 7356#endif
7357#ifdef CONFIG_SCHED_BOOK
7358 for_each_cpu(i, cpu_map) {
7359 sd = &per_cpu(book_domains, i).sd;
7360 init_sched_groups_power(i, sd);
7361 }
7362#endif
7152 7363
7153 for_each_cpu(i, cpu_map) { 7364 for_each_cpu(i, cpu_map) {
7154 sd = &per_cpu(phys_domains, i).sd; 7365 sd = &per_cpu(phys_domains, i).sd;
@@ -7174,6 +7385,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7174 sd = &per_cpu(cpu_domains, i).sd; 7385 sd = &per_cpu(cpu_domains, i).sd;
7175#elif defined(CONFIG_SCHED_MC) 7386#elif defined(CONFIG_SCHED_MC)
7176 sd = &per_cpu(core_domains, i).sd; 7387 sd = &per_cpu(core_domains, i).sd;
7388#elif defined(CONFIG_SCHED_BOOK)
7389 sd = &per_cpu(book_domains, i).sd;
7177#else 7390#else
7178 sd = &per_cpu(phys_domains, i).sd; 7391 sd = &per_cpu(phys_domains, i).sd;
7179#endif 7392#endif
@@ -8078,9 +8291,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8078 8291
8079 return 1; 8292 return 1;
8080 8293
8081 err_free_rq: 8294err_free_rq:
8082 kfree(cfs_rq); 8295 kfree(cfs_rq);
8083 err: 8296err:
8084 return 0; 8297 return 0;
8085} 8298}
8086 8299
@@ -8168,9 +8381,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8168 8381
8169 return 1; 8382 return 1;
8170 8383
8171 err_free_rq: 8384err_free_rq:
8172 kfree(rt_rq); 8385 kfree(rt_rq);
8173 err: 8386err:
8174 return 0; 8387 return 0;
8175} 8388}
8176 8389
@@ -8528,7 +8741,7 @@ static int tg_set_bandwidth(struct task_group *tg,
8528 raw_spin_unlock(&rt_rq->rt_runtime_lock); 8741 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8529 } 8742 }
8530 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8743 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8531 unlock: 8744unlock:
8532 read_unlock(&tasklist_lock); 8745 read_unlock(&tasklist_lock);
8533 mutex_unlock(&rt_constraints_mutex); 8746 mutex_unlock(&rt_constraints_mutex);
8534 8747
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index db3f674ca49d..933f3d1b62ea 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -25,7 +25,7 @@
25 25
26/* 26/*
27 * Targeted preemption latency for CPU-bound tasks: 27 * Targeted preemption latency for CPU-bound tasks:
28 * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds) 28 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
29 * 29 *
30 * NOTE: this latency value is not the same as the concept of 30 * NOTE: this latency value is not the same as the concept of
31 * 'timeslice length' - timeslices in CFS are of variable length 31 * 'timeslice length' - timeslices in CFS are of variable length
@@ -52,7 +52,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
52 52
53/* 53/*
54 * Minimal preemption granularity for CPU-bound tasks: 54 * Minimal preemption granularity for CPU-bound tasks:
55 * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds) 55 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
56 */ 56 */
57unsigned int sysctl_sched_min_granularity = 750000ULL; 57unsigned int sysctl_sched_min_granularity = 750000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; 58unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
@@ -519,7 +519,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
519static void update_curr(struct cfs_rq *cfs_rq) 519static void update_curr(struct cfs_rq *cfs_rq)
520{ 520{
521 struct sched_entity *curr = cfs_rq->curr; 521 struct sched_entity *curr = cfs_rq->curr;
522 u64 now = rq_of(cfs_rq)->clock; 522 u64 now = rq_of(cfs_rq)->clock_task;
523 unsigned long delta_exec; 523 unsigned long delta_exec;
524 524
525 if (unlikely(!curr)) 525 if (unlikely(!curr))
@@ -602,7 +602,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
602 /* 602 /*
603 * We are starting a new run period: 603 * We are starting a new run period:
604 */ 604 */
605 se->exec_start = rq_of(cfs_rq)->clock; 605 se->exec_start = rq_of(cfs_rq)->clock_task;
606} 606}
607 607
608/************************************************** 608/**************************************************
@@ -1764,6 +1764,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
1764 set_task_cpu(p, this_cpu); 1764 set_task_cpu(p, this_cpu);
1765 activate_task(this_rq, p, 0); 1765 activate_task(this_rq, p, 0);
1766 check_preempt_curr(this_rq, p, 0); 1766 check_preempt_curr(this_rq, p, 0);
1767
1768 /* re-arm NEWIDLE balancing when moving tasks */
1769 src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
1770 this_rq->idle_stamp = 0;
1767} 1771}
1768 1772
1769/* 1773/*
@@ -1798,7 +1802,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1798 * 2) too many balance attempts have failed. 1802 * 2) too many balance attempts have failed.
1799 */ 1803 */
1800 1804
1801 tsk_cache_hot = task_hot(p, rq->clock, sd); 1805 tsk_cache_hot = task_hot(p, rq->clock_task, sd);
1802 if (!tsk_cache_hot || 1806 if (!tsk_cache_hot ||
1803 sd->nr_balance_failed > sd->cache_nice_tries) { 1807 sd->nr_balance_failed > sd->cache_nice_tries) {
1804#ifdef CONFIG_SCHEDSTATS 1808#ifdef CONFIG_SCHEDSTATS
@@ -2030,12 +2034,14 @@ struct sd_lb_stats {
2030 unsigned long this_load; 2034 unsigned long this_load;
2031 unsigned long this_load_per_task; 2035 unsigned long this_load_per_task;
2032 unsigned long this_nr_running; 2036 unsigned long this_nr_running;
2037 unsigned long this_has_capacity;
2033 2038
2034 /* Statistics of the busiest group */ 2039 /* Statistics of the busiest group */
2035 unsigned long max_load; 2040 unsigned long max_load;
2036 unsigned long busiest_load_per_task; 2041 unsigned long busiest_load_per_task;
2037 unsigned long busiest_nr_running; 2042 unsigned long busiest_nr_running;
2038 unsigned long busiest_group_capacity; 2043 unsigned long busiest_group_capacity;
2044 unsigned long busiest_has_capacity;
2039 2045
2040 int group_imb; /* Is there imbalance in this sd */ 2046 int group_imb; /* Is there imbalance in this sd */
2041#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 2047#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2058,6 +2064,7 @@ struct sg_lb_stats {
2058 unsigned long sum_weighted_load; /* Weighted load of group's tasks */ 2064 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2059 unsigned long group_capacity; 2065 unsigned long group_capacity;
2060 int group_imb; /* Is there an imbalance in the group ? */ 2066 int group_imb; /* Is there an imbalance in the group ? */
2067 int group_has_capacity; /* Is there extra capacity in the group? */
2061}; 2068};
2062 2069
2063/** 2070/**
@@ -2268,7 +2275,13 @@ unsigned long scale_rt_power(int cpu)
2268 u64 total, available; 2275 u64 total, available;
2269 2276
2270 total = sched_avg_period() + (rq->clock - rq->age_stamp); 2277 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2271 available = total - rq->rt_avg; 2278
2279 if (unlikely(total < rq->rt_avg)) {
2280 /* Ensures that power won't end up being negative */
2281 available = 0;
2282 } else {
2283 available = total - rq->rt_avg;
2284 }
2272 2285
2273 if (unlikely((s64)total < SCHED_LOAD_SCALE)) 2286 if (unlikely((s64)total < SCHED_LOAD_SCALE))
2274 total = SCHED_LOAD_SCALE; 2287 total = SCHED_LOAD_SCALE;
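The new branch guards a u64 underflow; a hedged illustration of why it matters now that irq time can feed rt_avg through the NONIRQ_POWER path added earlier in this diff:

/*
 * total  = sched_avg_period() + (rq->clock - rq->age_stamp)
 * rt_avg can now include hard/soft irq time via sched_rt_avg_update(),
 *        so on a heavily interrupted CPU it may briefly exceed total.
 *
 * Without the check, "available = total - rq->rt_avg" wraps to a huge
 * unsigned value and the derived cpu_power is inflated instead of being
 * scaled down toward zero.
 */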
@@ -2378,7 +2391,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2378 int local_group, const struct cpumask *cpus, 2391 int local_group, const struct cpumask *cpus,
2379 int *balance, struct sg_lb_stats *sgs) 2392 int *balance, struct sg_lb_stats *sgs)
2380{ 2393{
2381 unsigned long load, max_cpu_load, min_cpu_load; 2394 unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
2382 int i; 2395 int i;
2383 unsigned int balance_cpu = -1, first_idle_cpu = 0; 2396 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2384 unsigned long avg_load_per_task = 0; 2397 unsigned long avg_load_per_task = 0;
@@ -2389,6 +2402,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2389 /* Tally up the load of all CPUs in the group */ 2402 /* Tally up the load of all CPUs in the group */
2390 max_cpu_load = 0; 2403 max_cpu_load = 0;
2391 min_cpu_load = ~0UL; 2404 min_cpu_load = ~0UL;
2405 max_nr_running = 0;
2392 2406
2393 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 2407 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2394 struct rq *rq = cpu_rq(i); 2408 struct rq *rq = cpu_rq(i);
@@ -2406,8 +2420,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2406 load = target_load(i, load_idx); 2420 load = target_load(i, load_idx);
2407 } else { 2421 } else {
2408 load = source_load(i, load_idx); 2422 load = source_load(i, load_idx);
2409 if (load > max_cpu_load) 2423 if (load > max_cpu_load) {
2410 max_cpu_load = load; 2424 max_cpu_load = load;
2425 max_nr_running = rq->nr_running;
2426 }
2411 if (min_cpu_load > load) 2427 if (min_cpu_load > load)
2412 min_cpu_load = load; 2428 min_cpu_load = load;
2413 } 2429 }
@@ -2447,13 +2463,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2447 if (sgs->sum_nr_running) 2463 if (sgs->sum_nr_running)
2448 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 2464 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2449 2465
2450 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 2466 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
2451 sgs->group_imb = 1; 2467 sgs->group_imb = 1;
2452 2468
2453 sgs->group_capacity = 2469 sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2454 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2455 if (!sgs->group_capacity) 2470 if (!sgs->group_capacity)
2456 sgs->group_capacity = fix_small_capacity(sd, group); 2471 sgs->group_capacity = fix_small_capacity(sd, group);
2472
2473 if (sgs->group_capacity > sgs->sum_nr_running)
2474 sgs->group_has_capacity = 1;
2457} 2475}
2458 2476
2459/** 2477/**
@@ -2542,9 +2560,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2542 /* 2560 /*
2543 * In case the child domain prefers tasks go to siblings 2561 * In case the child domain prefers tasks go to siblings
2544 * first, lower the sg capacity to one so that we'll try 2562 * first, lower the sg capacity to one so that we'll try
2545 * and move all the excess tasks away. 2563 * and move all the excess tasks away. We lower the capacity
2564 * of a group only if the local group has the capacity to fit
2565 * these excess tasks, i.e. nr_running < group_capacity. The
2566 * extra check prevents the case where you always pull from the
2567 * heaviest group when it is already under-utilized (possible
2568 * when a large-weight task outweighs the tasks on the system).
2546 */ 2569 */
2547 if (prefer_sibling) 2570 if (prefer_sibling && !local_group && sds->this_has_capacity)
2548 sgs.group_capacity = min(sgs.group_capacity, 1UL); 2571 sgs.group_capacity = min(sgs.group_capacity, 1UL);
2549 2572
2550 if (local_group) { 2573 if (local_group) {
@@ -2552,12 +2575,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2552 sds->this = sg; 2575 sds->this = sg;
2553 sds->this_nr_running = sgs.sum_nr_running; 2576 sds->this_nr_running = sgs.sum_nr_running;
2554 sds->this_load_per_task = sgs.sum_weighted_load; 2577 sds->this_load_per_task = sgs.sum_weighted_load;
2578 sds->this_has_capacity = sgs.group_has_capacity;
2555 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { 2579 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
2556 sds->max_load = sgs.avg_load; 2580 sds->max_load = sgs.avg_load;
2557 sds->busiest = sg; 2581 sds->busiest = sg;
2558 sds->busiest_nr_running = sgs.sum_nr_running; 2582 sds->busiest_nr_running = sgs.sum_nr_running;
2559 sds->busiest_group_capacity = sgs.group_capacity; 2583 sds->busiest_group_capacity = sgs.group_capacity;
2560 sds->busiest_load_per_task = sgs.sum_weighted_load; 2584 sds->busiest_load_per_task = sgs.sum_weighted_load;
2585 sds->busiest_has_capacity = sgs.group_has_capacity;
2561 sds->group_imb = sgs.group_imb; 2586 sds->group_imb = sgs.group_imb;
2562 } 2587 }
2563 2588
@@ -2754,6 +2779,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2754 return fix_small_imbalance(sds, this_cpu, imbalance); 2779 return fix_small_imbalance(sds, this_cpu, imbalance);
2755 2780
2756} 2781}
2782
2757/******* find_busiest_group() helpers end here *********************/ 2783/******* find_busiest_group() helpers end here *********************/
2758 2784
2759/** 2785/**
@@ -2805,6 +2831,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2805 * 4) This group is more busy than the avg busyness at this 2831
2806 * sched_domain. 2832 * sched_domain.
2807 * 5) The imbalance is within the specified limit. 2833 * 5) The imbalance is within the specified limit.
2834 *
2835 * Note: when doing newidle balance, if the local group has excess
2836 * capacity (i.e. nr_running < group_capacity) and the busiest group
2837 * does not have any capacity, we force a load balance to pull tasks
2838 * to the local group. In this case, we skip past checks 3, 4 and 5.
2808 */ 2839 */
2809 if (!(*balance)) 2840 if (!(*balance))
2810 goto ret; 2841 goto ret;
@@ -2816,6 +2847,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2816 if (!sds.busiest || sds.busiest_nr_running == 0) 2847 if (!sds.busiest || sds.busiest_nr_running == 0)
2817 goto out_balanced; 2848 goto out_balanced;
2818 2849
2850 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
2851 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
2852 !sds.busiest_has_capacity)
2853 goto force_balance;
2854
2819 if (sds.this_load >= sds.max_load) 2855 if (sds.this_load >= sds.max_load)
2820 goto out_balanced; 2856 goto out_balanced;
2821 2857
@@ -2827,6 +2863,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2827 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 2863 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
2828 goto out_balanced; 2864 goto out_balanced;
2829 2865
2866force_balance:
2830 /* Looks like there is an imbalance. Compute it */ 2867 /* Looks like there is an imbalance. Compute it */
2831 calculate_imbalance(&sds, this_cpu, imbalance); 2868 calculate_imbalance(&sds, this_cpu, imbalance);
2832 return sds.busiest; 2869 return sds.busiest;
@@ -3031,7 +3068,14 @@ redo:
3031 3068
3032 if (!ld_moved) { 3069 if (!ld_moved) {
3033 schedstat_inc(sd, lb_failed[idle]); 3070 schedstat_inc(sd, lb_failed[idle]);
3034 sd->nr_balance_failed++; 3071 /*
3072 * Increment the failure counter only on periodic balance.
3073 * We do not want newidle balance, which can be very
3074 * frequent, to pollute the failure counter and cause
3075 * excessive cache_hot migrations and active balances.
3076 */
3077 if (idle != CPU_NEWLY_IDLE)
3078 sd->nr_balance_failed++;
3035 3079
3036 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest), 3080 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
3037 this_cpu)) { 3081 this_cpu)) {
@@ -3153,10 +3197,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3153 interval = msecs_to_jiffies(sd->balance_interval); 3197 interval = msecs_to_jiffies(sd->balance_interval);
3154 if (time_after(next_balance, sd->last_balance + interval)) 3198 if (time_after(next_balance, sd->last_balance + interval))
3155 next_balance = sd->last_balance + interval; 3199 next_balance = sd->last_balance + interval;
3156 if (pulled_task) { 3200 if (pulled_task)
3157 this_rq->idle_stamp = 0;
3158 break; 3201 break;
3159 }
3160 } 3202 }
3161 3203
3162 raw_spin_lock(&this_rq->lock); 3204 raw_spin_lock(&this_rq->lock);
@@ -3751,8 +3793,11 @@ static void task_fork_fair(struct task_struct *p)
3751 3793
3752 update_rq_clock(rq); 3794 update_rq_clock(rq);
3753 3795
3754 if (unlikely(task_cpu(p) != this_cpu)) 3796 if (unlikely(task_cpu(p) != this_cpu)) {
3797 rcu_read_lock();
3755 __set_task_cpu(p, this_cpu); 3798 __set_task_cpu(p, this_cpu);
3799 rcu_read_unlock();
3800 }
3756 3801
3757 update_curr(cfs_rq); 3802 update_curr(cfs_rq);
3758 3803
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 83c66e8ad3ee..185f920ec1a2 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -61,3 +61,8 @@ SCHED_FEAT(ASYM_EFF_LOAD, 1)
61 * release the lock. Decreases scheduling overhead. 61 * release the lock. Decreases scheduling overhead.
62 */ 62 */
63SCHED_FEAT(OWNER_SPIN, 1) 63SCHED_FEAT(OWNER_SPIN, 1)
64
65/*
66 * Decrement CPU power based on irq activity
67 */
68SCHED_FEAT(NONIRQ_POWER, 1)
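Hedged usage note: NONIRQ_POWER gates the sched_irq_time_avg_update() call earlier in this diff, which feeds accumulated hard/soft irq time into rt_avg so that scale_rt_power() discounts it from cpu_power. With CONFIG_SCHED_DEBUG the feature can be toggled at run time by writing NONIRQ_POWER or NO_NONIRQ_POWER to the sched_features file in debugfs, conventionally mounted at /sys/kernel/debug.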
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d10c80ebb67a..bea7d79f7e9c 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -609,7 +609,7 @@ static void update_curr_rt(struct rq *rq)
609 if (!task_has_rt_policy(curr)) 609 if (!task_has_rt_policy(curr))
610 return; 610 return;
611 611
612 delta_exec = rq->clock - curr->se.exec_start; 612 delta_exec = rq->clock_task - curr->se.exec_start;
613 if (unlikely((s64)delta_exec < 0)) 613 if (unlikely((s64)delta_exec < 0))
614 delta_exec = 0; 614 delta_exec = 0;
615 615
@@ -618,7 +618,7 @@ static void update_curr_rt(struct rq *rq)
618 curr->se.sum_exec_runtime += delta_exec; 618 curr->se.sum_exec_runtime += delta_exec;
619 account_group_exec_runtime(curr, delta_exec); 619 account_group_exec_runtime(curr, delta_exec);
620 620
621 curr->se.exec_start = rq->clock; 621 curr->se.exec_start = rq->clock_task;
622 cpuacct_charge(curr, delta_exec); 622 cpuacct_charge(curr, delta_exec);
623 623
624 sched_rt_avg_update(rq, delta_exec); 624 sched_rt_avg_update(rq, delta_exec);
@@ -960,18 +960,19 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
960 * runqueue. Otherwise simply start this RT task 960 * runqueue. Otherwise simply start this RT task
961 * on its current runqueue. 961 * on its current runqueue.
962 * 962 *
963 * We want to avoid overloading runqueues. Even if 963 * We want to avoid overloading runqueues. If the woken
964 * the RT task is of higher priority than the current RT task. 964 * task is a higher priority, then it will stay on this CPU
965 * RT tasks behave differently than other tasks. If 965 * and the lower prio task should be moved to another CPU.
966 * one gets preempted, we try to push it off to another queue. 966 * Even though this will probably make the lower prio task
967 * So trying to keep a preempting RT task on the same 967 * lose its cache, we do not want to bounce a higher task
968 * cache hot CPU will force the running RT task to 968 * around just because it gave up its CPU, perhaps for a
969 * a cold CPU. So we waste all the cache for the lower 969 * lock?
970 * RT task in hopes of saving some of a RT task 970 *
971 * that is just being woken and probably will have 971 * For equal prio tasks, we just let the scheduler sort it out.
972 * cold cache anyway.
973 */ 972 */
974 if (unlikely(rt_task(rq->curr)) && 973 if (unlikely(rt_task(rq->curr)) &&
974 (rq->curr->rt.nr_cpus_allowed < 2 ||
975 rq->curr->prio < p->prio) &&
975 (p->rt.nr_cpus_allowed > 1)) { 976 (p->rt.nr_cpus_allowed > 1)) {
976 int cpu = find_lowest_rq(p); 977 int cpu = find_lowest_rq(p);
977 978
@@ -1074,7 +1075,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1074 } while (rt_rq); 1075 } while (rt_rq);
1075 1076
1076 p = rt_task_of(rt_se); 1077 p = rt_task_of(rt_se);
1077 p->se.exec_start = rq->clock; 1078 p->se.exec_start = rq->clock_task;
1078 1079
1079 return p; 1080 return p;
1080} 1081}
@@ -1139,7 +1140,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
1139 for_each_leaf_rt_rq(rt_rq, rq) { 1140 for_each_leaf_rt_rq(rt_rq, rq) {
1140 array = &rt_rq->active; 1141 array = &rt_rq->active;
1141 idx = sched_find_first_bit(array->bitmap); 1142 idx = sched_find_first_bit(array->bitmap);
1142 next_idx: 1143next_idx:
1143 if (idx >= MAX_RT_PRIO) 1144 if (idx >= MAX_RT_PRIO)
1144 continue; 1145 continue;
1145 if (next && next->prio < idx) 1146 if (next && next->prio < idx)
@@ -1315,7 +1316,7 @@ static int push_rt_task(struct rq *rq)
1315 if (!next_task) 1316 if (!next_task)
1316 return 0; 1317 return 0;
1317 1318
1318 retry: 1319retry:
1319 if (unlikely(next_task == rq->curr)) { 1320 if (unlikely(next_task == rq->curr)) {
1320 WARN_ON(1); 1321 WARN_ON(1);
1321 return 0; 1322 return 0;
@@ -1463,7 +1464,7 @@ static int pull_rt_task(struct rq *this_rq)
1463 * but possible) 1464 * but possible)
1464 */ 1465 */
1465 } 1466 }
1466 skip: 1467skip:
1467 double_unlock_balance(this_rq, src_rq); 1468 double_unlock_balance(this_rq, src_rq);
1468 } 1469 }
1469 1470
@@ -1491,7 +1492,10 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1491 if (!task_running(rq, p) && 1492 if (!task_running(rq, p) &&
1492 !test_tsk_need_resched(rq->curr) && 1493 !test_tsk_need_resched(rq->curr) &&
1493 has_pushable_tasks(rq) && 1494 has_pushable_tasks(rq) &&
1494 p->rt.nr_cpus_allowed > 1) 1495 p->rt.nr_cpus_allowed > 1 &&
1496 rt_task(rq->curr) &&
1497 (rq->curr->rt.nr_cpus_allowed < 2 ||
1498 rq->curr->prio < p->prio))
1495 push_rt_tasks(rq); 1499 push_rt_tasks(rq);
1496} 1500}
1497 1501
@@ -1709,7 +1713,7 @@ static void set_curr_task_rt(struct rq *rq)
1709{ 1713{
1710 struct task_struct *p = rq->curr; 1714 struct task_struct *p = rq->curr;
1711 1715
1712 p->se.exec_start = rq->clock; 1716 p->se.exec_start = rq->clock_task;
1713 1717
1714 /* The running task is never eligible for pushing */ 1718 /* The running task is never eligible for pushing */
1715 dequeue_pushable_task(rq, p); 1719 dequeue_pushable_task(rq, p);
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
new file mode 100644
index 000000000000..45bddc0c1048
--- /dev/null
+++ b/kernel/sched_stoptask.c
@@ -0,0 +1,108 @@
1/*
2 * stop-task scheduling class.
3 *
4 * The stop task is the highest priority task in the system; it preempts
5 * everything and will be preempted by nothing.
6 *
7 * See kernel/stop_machine.c
8 */
9
10#ifdef CONFIG_SMP
11static int
12select_task_rq_stop(struct rq *rq, struct task_struct *p,
13 int sd_flag, int flags)
14{
15	return task_cpu(p); /* stop tasks never migrate */
16}
17#endif /* CONFIG_SMP */
18
19static void
20check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
21{
22 resched_task(rq->curr); /* we preempt everything */
23}
24
25static struct task_struct *pick_next_task_stop(struct rq *rq)
26{
27 struct task_struct *stop = rq->stop;
28
29 if (stop && stop->state == TASK_RUNNING)
30 return stop;
31
32 return NULL;
33}
34
35static void
36enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
37{
38}
39
40static void
41dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
42{
43}
44
45static void yield_task_stop(struct rq *rq)
46{
47	BUG(); /* the stop task should never yield, it's pointless. */
48}
49
50static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
51{
52}
53
54static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
55{
56}
57
58static void set_curr_task_stop(struct rq *rq)
59{
60}
61
62static void switched_to_stop(struct rq *rq, struct task_struct *p,
63 int running)
64{
65	BUG(); /* it's impossible to change to this class */
66}
67
68static void prio_changed_stop(struct rq *rq, struct task_struct *p,
69 int oldprio, int running)
70{
71 BUG(); /* how!?, what priority? */
72}
73
74static unsigned int
75get_rr_interval_stop(struct rq *rq, struct task_struct *task)
76{
77 return 0;
78}
79
80/*
81 * Simple, special scheduling class for the per-CPU stop tasks:
82 */
83static const struct sched_class stop_sched_class = {
84 .next = &rt_sched_class,
85
86 .enqueue_task = enqueue_task_stop,
87 .dequeue_task = dequeue_task_stop,
88 .yield_task = yield_task_stop,
89
90 .check_preempt_curr = check_preempt_curr_stop,
91
92 .pick_next_task = pick_next_task_stop,
93 .put_prev_task = put_prev_task_stop,
94
95#ifdef CONFIG_SMP
96 .select_task_rq = select_task_rq_stop,
97#endif
98
99 .set_curr_task = set_curr_task_stop,
100 .task_tick = task_tick_stop,
101
102 .get_rr_interval = get_rr_interval_stop,
103
104 .prio_changed = prio_changed_stop,
105 .switched_to = switched_to_stop,
106
107 /* no .task_new for stop tasks */
108};
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 07b4f1b1a73a..fc978889b194 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -77,11 +77,21 @@ void wakeup_softirqd(void)
77} 77}
78 78
79/* 79/*
80 * preempt_count and SOFTIRQ_OFFSET usage:
81 * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
82 * softirq processing.
83 * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
84 * on local_bh_disable or local_bh_enable.
85 * This lets us distinguish between whether we are currently processing
86 * softirq and whether we just have bh disabled.
87 */
88
89/*
80 * This one is for softirq.c-internal use, 90 * This one is for softirq.c-internal use,
81 * where hardirqs are disabled legitimately: 91 * where hardirqs are disabled legitimately:
82 */ 92 */
83#ifdef CONFIG_TRACE_IRQFLAGS 93#ifdef CONFIG_TRACE_IRQFLAGS
84static void __local_bh_disable(unsigned long ip) 94static void __local_bh_disable(unsigned long ip, unsigned int cnt)
85{ 95{
86 unsigned long flags; 96 unsigned long flags;
87 97
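A hedged recap of the two offsets the new comment block distinguishes, assuming the conventional preempt_count layout in which the SOFTIRQ bits start at bit 8:

/*
 * SOFTIRQ_OFFSET         = 1 << 8             (0x100)
 * SOFTIRQ_DISABLE_OFFSET = 2 * SOFTIRQ_OFFSET (0x200)
 *
 * local_bh_disable()/enable()  move preempt_count by 0x200
 * __do_softirq() entry/exit    move it by 0x100
 *
 * So softirq_count() & SOFTIRQ_OFFSET (in_serving_softirq()) is true only
 * while softirq handlers are actually running, while in_softirq() keeps
 * its old meaning of "softirqs disabled or being served".
 */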
@@ -95,32 +105,43 @@ static void __local_bh_disable(unsigned long ip)
95 * We must manually increment preempt_count here and manually 105 * We must manually increment preempt_count here and manually
96 * call the trace_preempt_off later. 106 * call the trace_preempt_off later.
97 */ 107 */
98 preempt_count() += SOFTIRQ_OFFSET; 108 preempt_count() += cnt;
99 /* 109 /*
100 * Were softirqs turned off above: 110 * Were softirqs turned off above:
101 */ 111 */
102 if (softirq_count() == SOFTIRQ_OFFSET) 112 if (softirq_count() == cnt)
103 trace_softirqs_off(ip); 113 trace_softirqs_off(ip);
104 raw_local_irq_restore(flags); 114 raw_local_irq_restore(flags);
105 115
106 if (preempt_count() == SOFTIRQ_OFFSET) 116 if (preempt_count() == cnt)
107 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 117 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
108} 118}
109#else /* !CONFIG_TRACE_IRQFLAGS */ 119#else /* !CONFIG_TRACE_IRQFLAGS */
110static inline void __local_bh_disable(unsigned long ip) 120static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
111{ 121{
112 add_preempt_count(SOFTIRQ_OFFSET); 122 add_preempt_count(cnt);
113 barrier(); 123 barrier();
114} 124}
115#endif /* CONFIG_TRACE_IRQFLAGS */ 125#endif /* CONFIG_TRACE_IRQFLAGS */
116 126
117void local_bh_disable(void) 127void local_bh_disable(void)
118{ 128{
119 __local_bh_disable((unsigned long)__builtin_return_address(0)); 129 __local_bh_disable((unsigned long)__builtin_return_address(0),
130 SOFTIRQ_DISABLE_OFFSET);
120} 131}
121 132
122EXPORT_SYMBOL(local_bh_disable); 133EXPORT_SYMBOL(local_bh_disable);
123 134
135static void __local_bh_enable(unsigned int cnt)
136{
137 WARN_ON_ONCE(in_irq());
138 WARN_ON_ONCE(!irqs_disabled());
139
140 if (softirq_count() == cnt)
141 trace_softirqs_on((unsigned long)__builtin_return_address(0));
142 sub_preempt_count(cnt);
143}
144
124/* 145/*
125 * Special-case - softirqs can safely be enabled in 146 * Special-case - softirqs can safely be enabled in
126 * cond_resched_softirq(), or by __do_softirq(), 147 * cond_resched_softirq(), or by __do_softirq(),
@@ -128,12 +149,7 @@ EXPORT_SYMBOL(local_bh_disable);
128 */ 149 */
129void _local_bh_enable(void) 150void _local_bh_enable(void)
130{ 151{
131 WARN_ON_ONCE(in_irq()); 152 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
132 WARN_ON_ONCE(!irqs_disabled());
133
134 if (softirq_count() == SOFTIRQ_OFFSET)
135 trace_softirqs_on((unsigned long)__builtin_return_address(0));
136 sub_preempt_count(SOFTIRQ_OFFSET);
137} 153}
138 154
139EXPORT_SYMBOL(_local_bh_enable); 155EXPORT_SYMBOL(_local_bh_enable);
@@ -147,13 +163,13 @@ static inline void _local_bh_enable_ip(unsigned long ip)
147 /* 163 /*
148 * Are softirqs going to be turned on now: 164 * Are softirqs going to be turned on now:
149 */ 165 */
150 if (softirq_count() == SOFTIRQ_OFFSET) 166 if (softirq_count() == SOFTIRQ_DISABLE_OFFSET)
151 trace_softirqs_on(ip); 167 trace_softirqs_on(ip);
152 /* 168 /*
153 * Keep preemption disabled until we are done with 169 * Keep preemption disabled until we are done with
154 * softirq processing: 170 * softirq processing:
155 */ 171 */
156 sub_preempt_count(SOFTIRQ_OFFSET - 1); 172 sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1);
157 173
158 if (unlikely(!in_interrupt() && local_softirq_pending())) 174 if (unlikely(!in_interrupt() && local_softirq_pending()))
159 do_softirq(); 175 do_softirq();
@@ -198,7 +214,8 @@ asmlinkage void __do_softirq(void)
198 pending = local_softirq_pending(); 214 pending = local_softirq_pending();
199 account_system_vtime(current); 215 account_system_vtime(current);
200 216
201 __local_bh_disable((unsigned long)__builtin_return_address(0)); 217 __local_bh_disable((unsigned long)__builtin_return_address(0),
218 SOFTIRQ_OFFSET);
202 lockdep_softirq_enter(); 219 lockdep_softirq_enter();
203 220
204 cpu = smp_processor_id(); 221 cpu = smp_processor_id();
@@ -245,7 +262,7 @@ restart:
245 lockdep_softirq_exit(); 262 lockdep_softirq_exit();
246 263
247 account_system_vtime(current); 264 account_system_vtime(current);
248 _local_bh_enable(); 265 __local_bh_enable(SOFTIRQ_OFFSET);
249} 266}
250 267
251#ifndef __ARCH_HAS_DO_SOFTIRQ 268#ifndef __ARCH_HAS_DO_SOFTIRQ
@@ -279,10 +296,16 @@ void irq_enter(void)
279 296
280 rcu_irq_enter(); 297 rcu_irq_enter();
281 if (idle_cpu(cpu) && !in_interrupt()) { 298 if (idle_cpu(cpu) && !in_interrupt()) {
282 __irq_enter(); 299 /*
300 * Prevent raise_softirq from needlessly waking up ksoftirqd
301 * here, as softirq will be serviced on return from interrupt.
302 */
303 local_bh_disable();
283 tick_check_idle(cpu); 304 tick_check_idle(cpu);
284 } else 305 _local_bh_enable();
285 __irq_enter(); 306 }
307
308 __irq_enter();
286} 309}
287 310
288#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 311#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
@@ -696,6 +719,7 @@ static int run_ksoftirqd(void * __bind_cpu)
696{ 719{
697 set_current_state(TASK_INTERRUPTIBLE); 720 set_current_state(TASK_INTERRUPTIBLE);
698 721
722 current->flags |= PF_KSOFTIRQD;
699 while (!kthread_should_stop()) { 723 while (!kthread_should_stop()) {
700 preempt_disable(); 724 preempt_disable();
701 if (!local_softirq_pending()) { 725 if (!local_softirq_pending()) {
@@ -886,17 +910,14 @@ int __init __weak early_irq_init(void)
886 return 0; 910 return 0;
887} 911}
888 912
913#ifdef CONFIG_GENERIC_HARDIRQS
889int __init __weak arch_probe_nr_irqs(void) 914int __init __weak arch_probe_nr_irqs(void)
890{ 915{
891 return 0; 916 return NR_IRQS_LEGACY;
892} 917}
893 918
894int __init __weak arch_early_irq_init(void) 919int __init __weak arch_early_irq_init(void)
895{ 920{
896 return 0; 921 return 0;
897} 922}
898 923#endif
899int __weak arch_init_chip_data(struct irq_desc *desc, int node)
900{
901 return 0;
902}
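
A small sketch of the distinction described by the new comment, with an assumed preempt_count layout (SOFTIRQ count in bits 8..15): __do_softirq() adds only SOFTIRQ_OFFSET, local_bh_disable() adds SOFTIRQ_DISABLE_OFFSET, so the low SOFTIRQ bit identifies genuine softirq processing rather than a bh-disabled section.

/* Assumed layout for illustration: SOFTIRQ count at bits 8..15. */
#define SKETCH_SOFTIRQ_SHIFT		8
#define SKETCH_SOFTIRQ_OFFSET		(1UL << SKETCH_SOFTIRQ_SHIFT)
#define SKETCH_SOFTIRQ_DISABLE_OFFSET	(2 * SKETCH_SOFTIRQ_OFFSET)
#define SKETCH_SOFTIRQ_MASK		(0xffUL << SKETCH_SOFTIRQ_SHIFT)

/* Non-zero only while softirqs are actually being serviced. */
static inline unsigned long sketch_in_serving_softirq(unsigned long pcnt)
{
	return pcnt & SKETCH_SOFTIRQ_OFFSET;
}

/* Non-zero while serving softirqs or inside local_bh_disable(). */
static inline unsigned long sketch_in_softirq(unsigned long pcnt)
{
	return pcnt & SKETCH_SOFTIRQ_MASK;
}
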
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 2980da3fd509..c71e07500536 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -46,11 +46,9 @@ static int init_srcu_struct_fields(struct srcu_struct *sp)
46int __init_srcu_struct(struct srcu_struct *sp, const char *name, 46int __init_srcu_struct(struct srcu_struct *sp, const char *name,
47 struct lock_class_key *key) 47 struct lock_class_key *key)
48{ 48{
49#ifdef CONFIG_DEBUG_LOCK_ALLOC
50 /* Don't re-initialize a lock while it is held. */ 49 /* Don't re-initialize a lock while it is held. */
51 debug_check_no_locks_freed((void *)sp, sizeof(*sp)); 50 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
52 lockdep_init_map(&sp->dep_map, name, key, 0); 51 lockdep_init_map(&sp->dep_map, name, key, 0);
53#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
54 return init_srcu_struct_fields(sp); 52 return init_srcu_struct_fields(sp);
55} 53}
56EXPORT_SYMBOL_GPL(__init_srcu_struct); 54EXPORT_SYMBOL_GPL(__init_srcu_struct);
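
For reference, a sketch of the wrapper through which lockdep-enabled callers are assumed to reach __init_srcu_struct(); the macro supplies a per-srcu_struct lock class (this is how the srcu header is assumed to define init_srcu_struct() under CONFIG_DEBUG_LOCK_ALLOC, and is not part of this hunk).

#ifdef CONFIG_DEBUG_LOCK_ALLOC
#define init_srcu_struct(sp) \
({ \
	static struct lock_class_key __srcu_key; \
	\
	__init_srcu_struct((sp), #sp, &__srcu_key); \
})
#endif
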
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 4372ccb25127..090c28812ce1 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -287,11 +287,12 @@ repeat:
287 goto repeat; 287 goto repeat;
288} 288}
289 289
290extern void sched_set_stop_task(int cpu, struct task_struct *stop);
291
290/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ 292/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
291static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, 293static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
292 unsigned long action, void *hcpu) 294 unsigned long action, void *hcpu)
293{ 295{
294 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
295 unsigned int cpu = (unsigned long)hcpu; 296 unsigned int cpu = (unsigned long)hcpu;
296 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 297 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
297 struct task_struct *p; 298 struct task_struct *p;
@@ -304,13 +305,13 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
304 cpu); 305 cpu);
305 if (IS_ERR(p)) 306 if (IS_ERR(p))
306 return NOTIFY_BAD; 307 return NOTIFY_BAD;
307 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
308 get_task_struct(p); 308 get_task_struct(p);
309 kthread_bind(p, cpu);
310 sched_set_stop_task(cpu, p);
309 stopper->thread = p; 311 stopper->thread = p;
310 break; 312 break;
311 313
312 case CPU_ONLINE: 314 case CPU_ONLINE:
313 kthread_bind(stopper->thread, cpu);
314 /* strictly unnecessary, as first user will wake it */ 315 /* strictly unnecessary, as first user will wake it */
315 wake_up_process(stopper->thread); 316 wake_up_process(stopper->thread);
316 /* mark enabled */ 317 /* mark enabled */
@@ -325,6 +326,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
325 { 326 {
326 struct cpu_stop_work *work; 327 struct cpu_stop_work *work;
327 328
329 sched_set_stop_task(cpu, NULL);
328 /* kill the stopper */ 330 /* kill the stopper */
329 kthread_stop(stopper->thread); 331 kthread_stop(stopper->thread);
330 /* drain remaining works */ 332 /* drain remaining works */
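
sched_set_stop_task() is provided by the scheduler; a simplified sketch of an assumed implementation (assuming a per-runqueue ->stop pointer and the stop class shown earlier) to show what replaces the explicit SCHED_FIFO setup removed above.

void sched_set_stop_task(int cpu, struct task_struct *stop)
{
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
	struct task_struct *old_stop = cpu_rq(cpu)->stop;

	if (stop) {
		/* Give it a recognisable RT identity for userspace, then
		 * switch it to the (otherwise invisible) stop class. */
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
		stop->sched_class = &stop_sched_class;
	}

	cpu_rq(cpu)->stop = stop;

	if (old_stop)
		old_stop->sched_class = &rt_sched_class;
}
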
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index c63116863a80..d2321891538f 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -149,10 +149,18 @@ static void ntp_update_offset(long offset)
149 time_reftime = get_seconds(); 149 time_reftime = get_seconds();
150 150
151 offset64 = offset; 151 offset64 = offset;
152 freq_adj = (offset64 * secs) << 152 freq_adj = ntp_update_offset_fll(offset64, secs);
153 (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
154 153
155 freq_adj += ntp_update_offset_fll(offset64, secs); 154 /*
155 * Clamp update interval to reduce PLL gain with low
156 * sampling rate (e.g. intermittent network connection)
157 * to avoid instability.
158 */
159 if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant)))
160 secs = 1 << (SHIFT_PLL + 1 + time_constant);
161
162 freq_adj += (offset64 * secs) <<
163 (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
156 164
157 freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); 165 freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED);
158 166
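
A worked sketch of the new clamp with assumed constants (SHIFT_PLL = 2, NTP_SCALE_SHIFT = 32, illustration only): with time_constant = 2 the PLL term stops growing once the sample interval exceeds 1 << (2 + 1 + 2) = 32 seconds, so a long gap between samples can no longer blow up the frequency adjustment.

/* Assumed constants, for illustration only. */
#define SKETCH_SHIFT_PLL	2
#define SKETCH_NTP_SCALE_SHIFT	32

static long long sketch_pll_adj(long long offset64, long secs, int time_constant)
{
	/* Clamp the sample interval before it multiplies the offset. */
	if (secs > 1 << (SKETCH_SHIFT_PLL + 1 + time_constant))
		secs = 1 << (SKETCH_SHIFT_PLL + 1 + time_constant);

	return (offset64 * secs) <<
		(SKETCH_NTP_SCALE_SHIFT -
		 2 * (SKETCH_SHIFT_PLL + 2 + time_constant));
}
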
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 65fb077ea79c..ebd80d50c474 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1638,8 +1638,8 @@ ftrace_failures_open(struct inode *inode, struct file *file)
1638 1638
1639 ret = ftrace_avail_open(inode, file); 1639 ret = ftrace_avail_open(inode, file);
1640 if (!ret) { 1640 if (!ret) {
1641 m = (struct seq_file *)file->private_data; 1641 m = file->private_data;
1642 iter = (struct ftrace_iterator *)m->private; 1642 iter = m->private;
1643 iter->flags = FTRACE_ITER_FAILURES; 1643 iter->flags = FTRACE_ITER_FAILURES;
1644 } 1644 }
1645 1645
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9ec59f541156..001bcd2ccf4a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2196,7 +2196,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
2196 2196
2197static int tracing_release(struct inode *inode, struct file *file) 2197static int tracing_release(struct inode *inode, struct file *file)
2198{ 2198{
2199 struct seq_file *m = (struct seq_file *)file->private_data; 2199 struct seq_file *m = file->private_data;
2200 struct trace_iterator *iter; 2200 struct trace_iterator *iter;
2201 int cpu; 2201 int cpu;
2202 2202
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d39b3c5454a5..9021f8c0c0c3 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -343,6 +343,10 @@ void trace_function(struct trace_array *tr,
343 unsigned long ip, 343 unsigned long ip,
344 unsigned long parent_ip, 344 unsigned long parent_ip,
345 unsigned long flags, int pc); 345 unsigned long flags, int pc);
346void trace_graph_function(struct trace_array *tr,
347 unsigned long ip,
348 unsigned long parent_ip,
349 unsigned long flags, int pc);
346void trace_default_header(struct seq_file *m); 350void trace_default_header(struct seq_file *m);
347void print_trace_header(struct seq_file *m, struct trace_iterator *iter); 351void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
348int trace_empty(struct trace_iterator *iter); 352int trace_empty(struct trace_iterator *iter);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index ef49e9370b25..76b05980225c 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -262,6 +262,34 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
262 return trace_graph_entry(trace); 262 return trace_graph_entry(trace);
263} 263}
264 264
265static void
266__trace_graph_function(struct trace_array *tr,
267 unsigned long ip, unsigned long flags, int pc)
268{
269 u64 time = trace_clock_local();
270 struct ftrace_graph_ent ent = {
271 .func = ip,
272 .depth = 0,
273 };
274 struct ftrace_graph_ret ret = {
275 .func = ip,
276 .depth = 0,
277 .calltime = time,
278 .rettime = time,
279 };
280
281 __trace_graph_entry(tr, &ent, flags, pc);
282 __trace_graph_return(tr, &ret, flags, pc);
283}
284
285void
286trace_graph_function(struct trace_array *tr,
287 unsigned long ip, unsigned long parent_ip,
288 unsigned long flags, int pc)
289{
290 __trace_graph_function(tr, ip, flags, pc);
291}
292
265void __trace_graph_return(struct trace_array *tr, 293void __trace_graph_return(struct trace_array *tr,
266 struct ftrace_graph_ret *trace, 294 struct ftrace_graph_ret *trace,
267 unsigned long flags, 295 unsigned long flags,
@@ -888,12 +916,20 @@ check_irq_entry(struct trace_iterator *iter, u32 flags,
888 unsigned long addr, int depth) 916 unsigned long addr, int depth)
889{ 917{
890 int cpu = iter->cpu; 918 int cpu = iter->cpu;
919 int *depth_irq;
891 struct fgraph_data *data = iter->private; 920 struct fgraph_data *data = iter->private;
892 int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
893 921
894 if (flags & TRACE_GRAPH_PRINT_IRQS) 922 /*
923 * If we are either displaying irqs, or we got called as
924 * a graph event and private data does not exist,
925 * then we bypass the irq check.
926 */
927 if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
928 (!data))
895 return 0; 929 return 0;
896 930
931 depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
932
897 /* 933 /*
898 * We are inside the irq code 934 * We are inside the irq code
899 */ 935 */
@@ -926,12 +962,20 @@ static int
926check_irq_return(struct trace_iterator *iter, u32 flags, int depth) 962check_irq_return(struct trace_iterator *iter, u32 flags, int depth)
927{ 963{
928 int cpu = iter->cpu; 964 int cpu = iter->cpu;
965 int *depth_irq;
929 struct fgraph_data *data = iter->private; 966 struct fgraph_data *data = iter->private;
930 int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
931 967
932 if (flags & TRACE_GRAPH_PRINT_IRQS) 968 /*
969 * If we are either displaying irqs, or we got called as
970 * a graph event and private data does not exist,
971 * then we bypass the irq check.
972 */
973 if ((flags & TRACE_GRAPH_PRINT_IRQS) ||
974 (!data))
933 return 0; 975 return 0;
934 976
977 depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq);
978
935 /* 979 /*
936 * We are not inside the irq code. 980 * We are not inside the irq code.
937 */ 981 */
@@ -1163,7 +1207,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1163 1207
1164 1208
1165enum print_line_t 1209enum print_line_t
1166print_graph_function_flags(struct trace_iterator *iter, u32 flags) 1210__print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1167{ 1211{
1168 struct ftrace_graph_ent_entry *field; 1212 struct ftrace_graph_ent_entry *field;
1169 struct fgraph_data *data = iter->private; 1213 struct fgraph_data *data = iter->private;
@@ -1226,7 +1270,18 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1226static enum print_line_t 1270static enum print_line_t
1227print_graph_function(struct trace_iterator *iter) 1271print_graph_function(struct trace_iterator *iter)
1228{ 1272{
1229 return print_graph_function_flags(iter, tracer_flags.val); 1273 return __print_graph_function_flags(iter, tracer_flags.val);
1274}
1275
1276enum print_line_t print_graph_function_flags(struct trace_iterator *iter,
1277 u32 flags)
1278{
1279 if (trace_flags & TRACE_ITER_LATENCY_FMT)
1280 flags |= TRACE_GRAPH_PRINT_DURATION;
1281 else
1282 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1283
1284 return __print_graph_function_flags(iter, flags);
1230} 1285}
1231 1286
1232static enum print_line_t 1287static enum print_line_t
@@ -1258,7 +1313,7 @@ static void print_lat_header(struct seq_file *s, u32 flags)
1258 seq_printf(s, "#%.*s|||| / \n", size, spaces); 1313 seq_printf(s, "#%.*s|||| / \n", size, spaces);
1259} 1314}
1260 1315
1261void print_graph_headers_flags(struct seq_file *s, u32 flags) 1316static void __print_graph_headers_flags(struct seq_file *s, u32 flags)
1262{ 1317{
1263 int lat = trace_flags & TRACE_ITER_LATENCY_FMT; 1318 int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
1264 1319
@@ -1299,6 +1354,23 @@ void print_graph_headers(struct seq_file *s)
1299 print_graph_headers_flags(s, tracer_flags.val); 1354 print_graph_headers_flags(s, tracer_flags.val);
1300} 1355}
1301 1356
1357void print_graph_headers_flags(struct seq_file *s, u32 flags)
1358{
1359 struct trace_iterator *iter = s->private;
1360
1361 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
1362 /* print nothing if the buffers are empty */
1363 if (trace_empty(iter))
1364 return;
1365
1366 print_trace_header(s, iter);
1367 flags |= TRACE_GRAPH_PRINT_DURATION;
1368 } else
1369 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
1370
1371 __print_graph_headers_flags(s, flags);
1372}
1373
1302void graph_trace_open(struct trace_iterator *iter) 1374void graph_trace_open(struct trace_iterator *iter)
1303{ 1375{
1304 /* pid and depth on the last trace processed */ 1376 /* pid and depth on the last trace processed */
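
A usage sketch (hypothetical caller, not part of the patch) for the new trace_graph_function() entry point: a latency tracer that only has a plain instruction address can still emit a graph-style record, which appears as the zero-duration entry/return pair built above.

/* Hypothetical helper: record @ip as a zero-duration graph event. */
static void sketch_record_graph_hit(struct trace_array *tr,
				    unsigned long ip, unsigned long parent_ip)
{
	unsigned long flags;
	int pc = preempt_count();

	local_save_flags(flags);
	trace_graph_function(tr, ip, parent_ip, flags, pc);
}
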
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 73a6b0601f2e..5cf8c602b880 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -87,14 +87,22 @@ static __cacheline_aligned_in_smp unsigned long max_sequence;
87 87
88#ifdef CONFIG_FUNCTION_TRACER 88#ifdef CONFIG_FUNCTION_TRACER
89/* 89/*
90 * irqsoff uses its own tracer function to keep the overhead down: 90 * Prologue for the preempt and irqs off function tracers.
91 *
92 * Returns 1 if it is OK to continue, and data->disabled is
93 * incremented.
94 * 0 if the trace is to be ignored, and data->disabled
95 * is kept the same.
96 *
97 * Note, this function is also used outside this ifdef but
98 * inside the #ifdef of the function graph tracer below.
99 * This is OK, since the function graph tracer is
100 * dependent on the function tracer.
91 */ 101 */
92static void 102static int func_prolog_dec(struct trace_array *tr,
93irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) 103 struct trace_array_cpu **data,
104 unsigned long *flags)
94{ 105{
95 struct trace_array *tr = irqsoff_trace;
96 struct trace_array_cpu *data;
97 unsigned long flags;
98 long disabled; 106 long disabled;
99 int cpu; 107 int cpu;
100 108
@@ -106,18 +114,38 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
106 */ 114 */
107 cpu = raw_smp_processor_id(); 115 cpu = raw_smp_processor_id();
108 if (likely(!per_cpu(tracing_cpu, cpu))) 116 if (likely(!per_cpu(tracing_cpu, cpu)))
109 return; 117 return 0;
110 118
111 local_save_flags(flags); 119 local_save_flags(*flags);
112 /* slight chance to get a false positive on tracing_cpu */ 120 /* slight chance to get a false positive on tracing_cpu */
113 if (!irqs_disabled_flags(flags)) 121 if (!irqs_disabled_flags(*flags))
114 return; 122 return 0;
115 123
116 data = tr->data[cpu]; 124 *data = tr->data[cpu];
117 disabled = atomic_inc_return(&data->disabled); 125 disabled = atomic_inc_return(&(*data)->disabled);
118 126
119 if (likely(disabled == 1)) 127 if (likely(disabled == 1))
120 trace_function(tr, ip, parent_ip, flags, preempt_count()); 128 return 1;
129
130 atomic_dec(&(*data)->disabled);
131
132 return 0;
133}
134
135/*
136 * irqsoff uses its own tracer function to keep the overhead down:
137 */
138static void
139irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
140{
141 struct trace_array *tr = irqsoff_trace;
142 struct trace_array_cpu *data;
143 unsigned long flags;
144
145 if (!func_prolog_dec(tr, &data, &flags))
146 return;
147
148 trace_function(tr, ip, parent_ip, flags, preempt_count());
121 149
122 atomic_dec(&data->disabled); 150 atomic_dec(&data->disabled);
123} 151}
@@ -155,30 +183,16 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
155 struct trace_array *tr = irqsoff_trace; 183 struct trace_array *tr = irqsoff_trace;
156 struct trace_array_cpu *data; 184 struct trace_array_cpu *data;
157 unsigned long flags; 185 unsigned long flags;
158 long disabled;
159 int ret; 186 int ret;
160 int cpu;
161 int pc; 187 int pc;
162 188
163 cpu = raw_smp_processor_id(); 189 if (!func_prolog_dec(tr, &data, &flags))
164 if (likely(!per_cpu(tracing_cpu, cpu)))
165 return 0; 190 return 0;
166 191
167 local_save_flags(flags); 192 pc = preempt_count();
168 /* slight chance to get a false positive on tracing_cpu */ 193 ret = __trace_graph_entry(tr, trace, flags, pc);
169 if (!irqs_disabled_flags(flags))
170 return 0;
171
172 data = tr->data[cpu];
173 disabled = atomic_inc_return(&data->disabled);
174
175 if (likely(disabled == 1)) {
176 pc = preempt_count();
177 ret = __trace_graph_entry(tr, trace, flags, pc);
178 } else
179 ret = 0;
180
181 atomic_dec(&data->disabled); 194 atomic_dec(&data->disabled);
195
182 return ret; 196 return ret;
183} 197}
184 198
@@ -187,27 +201,13 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
187 struct trace_array *tr = irqsoff_trace; 201 struct trace_array *tr = irqsoff_trace;
188 struct trace_array_cpu *data; 202 struct trace_array_cpu *data;
189 unsigned long flags; 203 unsigned long flags;
190 long disabled;
191 int cpu;
192 int pc; 204 int pc;
193 205
194 cpu = raw_smp_processor_id(); 206 if (!func_prolog_dec(tr, &data, &flags))
195 if (likely(!per_cpu(tracing_cpu, cpu)))
196 return; 207 return;
197 208
198 local_save_flags(flags); 209 pc = preempt_count();
199 /* slight chance to get a false positive on tracing_cpu */ 210 __trace_graph_return(tr, trace, flags, pc);
200 if (!irqs_disabled_flags(flags))
201 return;
202
203 data = tr->data[cpu];
204 disabled = atomic_inc_return(&data->disabled);
205
206 if (likely(disabled == 1)) {
207 pc = preempt_count();
208 __trace_graph_return(tr, trace, flags, pc);
209 }
210
211 atomic_dec(&data->disabled); 211 atomic_dec(&data->disabled);
212} 212}
213 213
@@ -229,75 +229,33 @@ static void irqsoff_trace_close(struct trace_iterator *iter)
229 229
230static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) 230static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
231{ 231{
232 u32 flags = GRAPH_TRACER_FLAGS;
233
234 if (trace_flags & TRACE_ITER_LATENCY_FMT)
235 flags |= TRACE_GRAPH_PRINT_DURATION;
236 else
237 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
238
239 /* 232 /*
240 * In graph mode call the graph tracer output function, 233 * In graph mode call the graph tracer output function,
241 * otherwise go with the TRACE_FN event handler 234 * otherwise go with the TRACE_FN event handler
242 */ 235 */
243 if (is_graph()) 236 if (is_graph())
244 return print_graph_function_flags(iter, flags); 237 return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
245 238
246 return TRACE_TYPE_UNHANDLED; 239 return TRACE_TYPE_UNHANDLED;
247} 240}
248 241
249static void irqsoff_print_header(struct seq_file *s) 242static void irqsoff_print_header(struct seq_file *s)
250{ 243{
251 if (is_graph()) { 244 if (is_graph())
252 struct trace_iterator *iter = s->private; 245 print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
253 u32 flags = GRAPH_TRACER_FLAGS; 246 else
254
255 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
256 /* print nothing if the buffers are empty */
257 if (trace_empty(iter))
258 return;
259
260 print_trace_header(s, iter);
261 flags |= TRACE_GRAPH_PRINT_DURATION;
262 } else
263 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
264
265 print_graph_headers_flags(s, flags);
266 } else
267 trace_default_header(s); 247 trace_default_header(s);
268} 248}
269 249
270static void 250static void
271trace_graph_function(struct trace_array *tr,
272 unsigned long ip, unsigned long flags, int pc)
273{
274 u64 time = trace_clock_local();
275 struct ftrace_graph_ent ent = {
276 .func = ip,
277 .depth = 0,
278 };
279 struct ftrace_graph_ret ret = {
280 .func = ip,
281 .depth = 0,
282 .calltime = time,
283 .rettime = time,
284 };
285
286 __trace_graph_entry(tr, &ent, flags, pc);
287 __trace_graph_return(tr, &ret, flags, pc);
288}
289
290static void
291__trace_function(struct trace_array *tr, 251__trace_function(struct trace_array *tr,
292 unsigned long ip, unsigned long parent_ip, 252 unsigned long ip, unsigned long parent_ip,
293 unsigned long flags, int pc) 253 unsigned long flags, int pc)
294{ 254{
295 if (!is_graph()) 255 if (is_graph())
256 trace_graph_function(tr, ip, parent_ip, flags, pc);
257 else
296 trace_function(tr, ip, parent_ip, flags, pc); 258 trace_function(tr, ip, parent_ip, flags, pc);
297 else {
298 trace_graph_function(tr, parent_ip, flags, pc);
299 trace_graph_function(tr, ip, flags, pc);
300 }
301} 259}
302 260
303#else 261#else
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 4086eae6e81b..7319559ed59f 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -31,48 +31,98 @@ static int wakeup_rt;
31static arch_spinlock_t wakeup_lock = 31static arch_spinlock_t wakeup_lock =
32 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 32 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
33 33
34static void wakeup_reset(struct trace_array *tr);
34static void __wakeup_reset(struct trace_array *tr); 35static void __wakeup_reset(struct trace_array *tr);
36static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
37static void wakeup_graph_return(struct ftrace_graph_ret *trace);
35 38
36static int save_lat_flag; 39static int save_lat_flag;
37 40
41#define TRACE_DISPLAY_GRAPH 1
42
43static struct tracer_opt trace_opts[] = {
44#ifdef CONFIG_FUNCTION_GRAPH_TRACER
45 /* display latency trace as call graph */
46 { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
47#endif
48 { } /* Empty entry */
49};
50
51static struct tracer_flags tracer_flags = {
52 .val = 0,
53 .opts = trace_opts,
54};
55
56#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
57
38#ifdef CONFIG_FUNCTION_TRACER 58#ifdef CONFIG_FUNCTION_TRACER
59
39/* 60/*
40 * irqsoff uses its own tracer function to keep the overhead down: 61 * Prologue for the wakeup function tracers.
62 *
63 * Returns 1 if it is OK to continue, and preemption
64 * is disabled and data->disabled is incremented.
65 * 0 if the trace is to be ignored, and preemption
66 * is not disabled and data->disabled is
67 * kept the same.
68 *
69 * Note, this function is also used outside this ifdef but
70 * inside the #ifdef of the function graph tracer below.
71 * This is OK, since the function graph tracer is
72 * dependent on the function tracer.
41 */ 73 */
42static void 74static int
43wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) 75func_prolog_preempt_disable(struct trace_array *tr,
76 struct trace_array_cpu **data,
77 int *pc)
44{ 78{
45 struct trace_array *tr = wakeup_trace;
46 struct trace_array_cpu *data;
47 unsigned long flags;
48 long disabled; 79 long disabled;
49 int cpu; 80 int cpu;
50 int pc;
51 81
52 if (likely(!wakeup_task)) 82 if (likely(!wakeup_task))
53 return; 83 return 0;
54 84
55 pc = preempt_count(); 85 *pc = preempt_count();
56 preempt_disable_notrace(); 86 preempt_disable_notrace();
57 87
58 cpu = raw_smp_processor_id(); 88 cpu = raw_smp_processor_id();
59 if (cpu != wakeup_current_cpu) 89 if (cpu != wakeup_current_cpu)
60 goto out_enable; 90 goto out_enable;
61 91
62 data = tr->data[cpu]; 92 *data = tr->data[cpu];
63 disabled = atomic_inc_return(&data->disabled); 93 disabled = atomic_inc_return(&(*data)->disabled);
64 if (unlikely(disabled != 1)) 94 if (unlikely(disabled != 1))
65 goto out; 95 goto out;
66 96
67 local_irq_save(flags); 97 return 1;
68 98
69 trace_function(tr, ip, parent_ip, flags, pc); 99out:
100 atomic_dec(&(*data)->disabled);
101
102out_enable:
103 preempt_enable_notrace();
104 return 0;
105}
70 106
107/*
108 * wakeup uses its own tracer function to keep the overhead down:
109 */
110static void
111wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
112{
113 struct trace_array *tr = wakeup_trace;
114 struct trace_array_cpu *data;
115 unsigned long flags;
116 int pc;
117
118 if (!func_prolog_preempt_disable(tr, &data, &pc))
119 return;
120
121 local_irq_save(flags);
122 trace_function(tr, ip, parent_ip, flags, pc);
71 local_irq_restore(flags); 123 local_irq_restore(flags);
72 124
73 out:
74 atomic_dec(&data->disabled); 125 atomic_dec(&data->disabled);
75 out_enable:
76 preempt_enable_notrace(); 126 preempt_enable_notrace();
77} 127}
78 128
@@ -82,6 +132,156 @@ static struct ftrace_ops trace_ops __read_mostly =
82}; 132};
83#endif /* CONFIG_FUNCTION_TRACER */ 133#endif /* CONFIG_FUNCTION_TRACER */
84 134
135static int start_func_tracer(int graph)
136{
137 int ret;
138
139 if (!graph)
140 ret = register_ftrace_function(&trace_ops);
141 else
142 ret = register_ftrace_graph(&wakeup_graph_return,
143 &wakeup_graph_entry);
144
145 if (!ret && tracing_is_enabled())
146 tracer_enabled = 1;
147 else
148 tracer_enabled = 0;
149
150 return ret;
151}
152
153static void stop_func_tracer(int graph)
154{
155 tracer_enabled = 0;
156
157 if (!graph)
158 unregister_ftrace_function(&trace_ops);
159 else
160 unregister_ftrace_graph();
161}
162
163#ifdef CONFIG_FUNCTION_GRAPH_TRACER
164static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
165{
166
167 if (!(bit & TRACE_DISPLAY_GRAPH))
168 return -EINVAL;
169
170 if (!(is_graph() ^ set))
171 return 0;
172
173 stop_func_tracer(!set);
174
175 wakeup_reset(wakeup_trace);
176 tracing_max_latency = 0;
177
178 return start_func_tracer(set);
179}
180
181static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
182{
183 struct trace_array *tr = wakeup_trace;
184 struct trace_array_cpu *data;
185 unsigned long flags;
186 int pc, ret = 0;
187
188 if (!func_prolog_preempt_disable(tr, &data, &pc))
189 return 0;
190
191 local_save_flags(flags);
192 ret = __trace_graph_entry(tr, trace, flags, pc);
193 atomic_dec(&data->disabled);
194 preempt_enable_notrace();
195
196 return ret;
197}
198
199static void wakeup_graph_return(struct ftrace_graph_ret *trace)
200{
201 struct trace_array *tr = wakeup_trace;
202 struct trace_array_cpu *data;
203 unsigned long flags;
204 int pc;
205
206 if (!func_prolog_preempt_disable(tr, &data, &pc))
207 return;
208
209 local_save_flags(flags);
210 __trace_graph_return(tr, trace, flags, pc);
211 atomic_dec(&data->disabled);
212
213 preempt_enable_notrace();
214 return;
215}
216
217static void wakeup_trace_open(struct trace_iterator *iter)
218{
219 if (is_graph())
220 graph_trace_open(iter);
221}
222
223static void wakeup_trace_close(struct trace_iterator *iter)
224{
225 if (iter->private)
226 graph_trace_close(iter);
227}
228
229#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC)
230
231static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
232{
233 /*
234 * In graph mode call the graph tracer output function,
235 * otherwise go with the TRACE_FN event handler
236 */
237 if (is_graph())
238 return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
239
240 return TRACE_TYPE_UNHANDLED;
241}
242
243static void wakeup_print_header(struct seq_file *s)
244{
245 if (is_graph())
246 print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
247 else
248 trace_default_header(s);
249}
250
251static void
252__trace_function(struct trace_array *tr,
253 unsigned long ip, unsigned long parent_ip,
254 unsigned long flags, int pc)
255{
256 if (is_graph())
257 trace_graph_function(tr, ip, parent_ip, flags, pc);
258 else
259 trace_function(tr, ip, parent_ip, flags, pc);
260}
261#else
262#define __trace_function trace_function
263
264static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
265{
266 return -EINVAL;
267}
268
269static int wakeup_graph_entry(struct ftrace_graph_ent *trace)
270{
271 return -1;
272}
273
274static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
275{
276 return TRACE_TYPE_UNHANDLED;
277}
278
279static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
280static void wakeup_print_header(struct seq_file *s) { }
281static void wakeup_trace_open(struct trace_iterator *iter) { }
282static void wakeup_trace_close(struct trace_iterator *iter) { }
283#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
284
85/* 285/*
86 * Should this new latency be reported/recorded? 286 * Should this new latency be reported/recorded?
87 */ 287 */
@@ -152,7 +352,7 @@ probe_wakeup_sched_switch(void *ignore,
152 /* The task we are waiting for is waking up */ 352 /* The task we are waiting for is waking up */
153 data = wakeup_trace->data[wakeup_cpu]; 353 data = wakeup_trace->data[wakeup_cpu];
154 354
155 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); 355 __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
156 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); 356 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
157 357
158 T0 = data->preempt_timestamp; 358 T0 = data->preempt_timestamp;
@@ -252,7 +452,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
252 * is not called by an assembly function (where as schedule is) 452 * is not called by an assembly function (where as schedule is)
253 * it should be safe to use it here. 453 * it should be safe to use it here.
254 */ 454 */
255 trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); 455 __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
256 456
257out_locked: 457out_locked:
258 arch_spin_unlock(&wakeup_lock); 458 arch_spin_unlock(&wakeup_lock);
@@ -303,12 +503,8 @@ static void start_wakeup_tracer(struct trace_array *tr)
303 */ 503 */
304 smp_wmb(); 504 smp_wmb();
305 505
306 register_ftrace_function(&trace_ops); 506 if (start_func_tracer(is_graph()))
307 507 printk(KERN_ERR "failed to start wakeup tracer\n");
308 if (tracing_is_enabled())
309 tracer_enabled = 1;
310 else
311 tracer_enabled = 0;
312 508
313 return; 509 return;
314fail_deprobe_wake_new: 510fail_deprobe_wake_new:
@@ -320,7 +516,7 @@ fail_deprobe:
320static void stop_wakeup_tracer(struct trace_array *tr) 516static void stop_wakeup_tracer(struct trace_array *tr)
321{ 517{
322 tracer_enabled = 0; 518 tracer_enabled = 0;
323 unregister_ftrace_function(&trace_ops); 519 stop_func_tracer(is_graph());
324 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); 520 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
325 unregister_trace_sched_wakeup_new(probe_wakeup, NULL); 521 unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
326 unregister_trace_sched_wakeup(probe_wakeup, NULL); 522 unregister_trace_sched_wakeup(probe_wakeup, NULL);
@@ -379,9 +575,15 @@ static struct tracer wakeup_tracer __read_mostly =
379 .start = wakeup_tracer_start, 575 .start = wakeup_tracer_start,
380 .stop = wakeup_tracer_stop, 576 .stop = wakeup_tracer_stop,
381 .print_max = 1, 577 .print_max = 1,
578 .print_header = wakeup_print_header,
579 .print_line = wakeup_print_line,
580 .flags = &tracer_flags,
581 .set_flag = wakeup_set_flag,
382#ifdef CONFIG_FTRACE_SELFTEST 582#ifdef CONFIG_FTRACE_SELFTEST
383 .selftest = trace_selftest_startup_wakeup, 583 .selftest = trace_selftest_startup_wakeup,
384#endif 584#endif
585 .open = wakeup_trace_open,
586 .close = wakeup_trace_close,
385 .use_max_tr = 1, 587 .use_max_tr = 1,
386}; 588};
387 589
@@ -394,9 +596,15 @@ static struct tracer wakeup_rt_tracer __read_mostly =
394 .stop = wakeup_tracer_stop, 596 .stop = wakeup_tracer_stop,
395 .wait_pipe = poll_wait_pipe, 597 .wait_pipe = poll_wait_pipe,
396 .print_max = 1, 598 .print_max = 1,
599 .print_header = wakeup_print_header,
600 .print_line = wakeup_print_line,
601 .flags = &tracer_flags,
602 .set_flag = wakeup_set_flag,
397#ifdef CONFIG_FTRACE_SELFTEST 603#ifdef CONFIG_FTRACE_SELFTEST
398 .selftest = trace_selftest_startup_wakeup, 604 .selftest = trace_selftest_startup_wakeup,
399#endif 605#endif
606 .open = wakeup_trace_open,
607 .close = wakeup_trace_close,
400 .use_max_tr = 1, 608 .use_max_tr = 1,
401}; 609};
402 610
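
A sketch of the assumed control flow behind the new display-graph option (simplified; the real core also handles tracer-independent trace_options): when the flag is toggled, the tracing core looks the bit up in the tracer's flags table and lets its ->set_flag() callback, wakeup_set_flag() above, accept or reject the change before the stored value is updated.

static int sketch_apply_tracer_option(struct tracer *t,
				      struct tracer_opt *opt, int set)
{
	/* Let the tracer act on, or veto, the change first. */
	int ret = t->set_flag(t->flags->val, opt->bit, set);

	if (ret)
		return ret;	/* e.g. -EINVAL without graph support */

	if (set)
		t->flags->val |= opt->bit;
	else
		t->flags->val &= ~opt->bit;

	return 0;
}
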
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index dc8e16824b51..bafba687a6d8 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -196,7 +196,7 @@ static struct perf_event_attr wd_hw_attr = {
196}; 196};
197 197
198/* Callback function for perf event subsystem */ 198/* Callback function for perf event subsystem */
199void watchdog_overflow_callback(struct perf_event *event, int nmi, 199static void watchdog_overflow_callback(struct perf_event *event, int nmi,
200 struct perf_sample_data *data, 200 struct perf_sample_data *data,
201 struct pt_regs *regs) 201 struct pt_regs *regs)
202{ 202{
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index e85d549b6eac..21ac83070a80 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -540,6 +540,23 @@ config PROVE_RCU_REPEATEDLY
540 disabling, allowing multiple RCU-lockdep warnings to be printed 540 disabling, allowing multiple RCU-lockdep warnings to be printed
541 on a single reboot. 541 on a single reboot.
542 542
543 Say Y to allow multiple RCU-lockdep warnings per boot.
544
545 Say N if you are unsure.
546
547config SPARSE_RCU_POINTER
548 bool "RCU debugging: sparse-based checks for pointer usage"
549 default n
550 help
551 This feature enables the __rcu sparse annotation for
552 RCU-protected pointers. This annotation will cause sparse
553	  to flag any non-RCU use of annotated pointers. This can be
554 helpful when debugging RCU usage. Please note that this feature
555 is not intended to enforce code cleanliness; it is instead merely
556 a debugging aid.
557
558 Say Y to make sparse flag questionable use of RCU-protected pointers
559
543 Say N if you are unsure. 560 Say N if you are unsure.
544 561
545config LOCKDEP 562config LOCKDEP
@@ -832,6 +849,30 @@ config RCU_CPU_STALL_DETECTOR
832 849
833 Say Y if you are unsure. 850 Say Y if you are unsure.
834 851
852config RCU_CPU_STALL_TIMEOUT
853 int "RCU CPU stall timeout in seconds"
854 depends on RCU_CPU_STALL_DETECTOR
855 range 3 300
856 default 60
857 help
858 If a given RCU grace period extends more than the specified
859 number of seconds, a CPU stall warning is printed. If the
860 RCU grace period persists, additional CPU stall warnings are
861 printed at more widely spaced intervals.
862
863config RCU_CPU_STALL_DETECTOR_RUNNABLE
864 bool "RCU CPU stall checking starts automatically at boot"
865 depends on RCU_CPU_STALL_DETECTOR
866 default y
867 help
868 If set, start checking for RCU CPU stalls immediately on
869 boot. Otherwise, RCU CPU stall checking must be manually
870 enabled.
871
872 Say Y if you are unsure.
873
874 Say N if you wish to suppress RCU CPU stall checking during boot.
875
835config RCU_CPU_STALL_VERBOSE 876config RCU_CPU_STALL_VERBOSE
836 bool "Print additional per-task information for RCU_CPU_STALL_DETECTOR" 877 bool "Print additional per-task information for RCU_CPU_STALL_DETECTOR"
837 depends on RCU_CPU_STALL_DETECTOR && TREE_PREEMPT_RCU 878 depends on RCU_CPU_STALL_DETECTOR && TREE_PREEMPT_RCU
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index efd16fa80b1c..6f412ab4c24f 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -49,7 +49,7 @@ struct radix_tree_node {
49 unsigned int height; /* Height from the bottom */ 49 unsigned int height; /* Height from the bottom */
50 unsigned int count; 50 unsigned int count;
51 struct rcu_head rcu_head; 51 struct rcu_head rcu_head;
52 void *slots[RADIX_TREE_MAP_SIZE]; 52 void __rcu *slots[RADIX_TREE_MAP_SIZE];
53 unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; 53 unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
54}; 54};
55 55
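
The __rcu annotation ties in with the SPARSE_RCU_POINTER option above; a minimal sketch (assumed example, not from this patch) of what sparse then checks: annotated pointers are only touched through rcu_assign_pointer()/rcu_dereference(), never dereferenced directly.

#include <linux/rcupdate.h>

struct sketch_box {
	int __rcu *val;			/* RCU-protected pointer */
};

static int sketch_read(struct sketch_box *b)
{
	int *p, v = 0;

	rcu_read_lock();
	p = rcu_dereference(b->val);	/* strips the __rcu address space */
	if (p)
		v = *p;
	rcu_read_unlock();
	return v;
}

static void sketch_publish(struct sketch_box *b, int *newp)
{
	rcu_assign_pointer(b->val, newp);	/* pairs with rcu_dereference() */
}
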
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 34e3082632d8..7c06ee51a29a 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -70,7 +70,7 @@ static unsigned long io_tlb_nslabs;
70 */ 70 */
71static unsigned long io_tlb_overflow = 32*1024; 71static unsigned long io_tlb_overflow = 32*1024;
72 72
73void *io_tlb_overflow_buffer; 73static void *io_tlb_overflow_buffer;
74 74
75/* 75/*
76 * This is a free list describing the number of free entries available from 76 * This is a free list describing the number of free entries available from
@@ -147,16 +147,16 @@ void __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
147 * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE 147 * to find contiguous free memory regions of size up to IO_TLB_SEGSIZE
148 * between io_tlb_start and io_tlb_end. 148 * between io_tlb_start and io_tlb_end.
149 */ 149 */
150 io_tlb_list = alloc_bootmem(io_tlb_nslabs * sizeof(int)); 150 io_tlb_list = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(int)));
151 for (i = 0; i < io_tlb_nslabs; i++) 151 for (i = 0; i < io_tlb_nslabs; i++)
152 io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); 152 io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE);
153 io_tlb_index = 0; 153 io_tlb_index = 0;
154 io_tlb_orig_addr = alloc_bootmem(io_tlb_nslabs * sizeof(phys_addr_t)); 154 io_tlb_orig_addr = alloc_bootmem_pages(PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)));
155 155
156 /* 156 /*
157 * Get the overflow emergency buffer 157 * Get the overflow emergency buffer
158 */ 158 */
159 io_tlb_overflow_buffer = alloc_bootmem_low(io_tlb_overflow); 159 io_tlb_overflow_buffer = alloc_bootmem_low_pages(PAGE_ALIGN(io_tlb_overflow));
160 if (!io_tlb_overflow_buffer) 160 if (!io_tlb_overflow_buffer)
161 panic("Cannot allocate SWIOTLB overflow buffer!\n"); 161 panic("Cannot allocate SWIOTLB overflow buffer!\n");
162 if (verbose) 162 if (verbose)
@@ -182,7 +182,7 @@ swiotlb_init_with_default_size(size_t default_size, int verbose)
182 /* 182 /*
183 * Get IO TLB memory from the low pages 183 * Get IO TLB memory from the low pages
184 */ 184 */
185 io_tlb_start = alloc_bootmem_low_pages(bytes); 185 io_tlb_start = alloc_bootmem_low_pages(PAGE_ALIGN(bytes));
186 if (!io_tlb_start) 186 if (!io_tlb_start)
187 panic("Cannot allocate SWIOTLB buffer"); 187 panic("Cannot allocate SWIOTLB buffer");
188 188
@@ -308,13 +308,13 @@ void __init swiotlb_free(void)
308 get_order(io_tlb_nslabs << IO_TLB_SHIFT)); 308 get_order(io_tlb_nslabs << IO_TLB_SHIFT));
309 } else { 309 } else {
310 free_bootmem_late(__pa(io_tlb_overflow_buffer), 310 free_bootmem_late(__pa(io_tlb_overflow_buffer),
311 io_tlb_overflow); 311 PAGE_ALIGN(io_tlb_overflow));
312 free_bootmem_late(__pa(io_tlb_orig_addr), 312 free_bootmem_late(__pa(io_tlb_orig_addr),
313 io_tlb_nslabs * sizeof(phys_addr_t)); 313 PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t)));
314 free_bootmem_late(__pa(io_tlb_list), 314 free_bootmem_late(__pa(io_tlb_list),
315 io_tlb_nslabs * sizeof(int)); 315 PAGE_ALIGN(io_tlb_nslabs * sizeof(int)));
316 free_bootmem_late(__pa(io_tlb_start), 316 free_bootmem_late(__pa(io_tlb_start),
317 io_tlb_nslabs << IO_TLB_SHIFT); 317 PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT));
318 } 318 }
319} 319}
320 320
diff --git a/mm/memory.c b/mm/memory.c
index 0e18b4d649ec..98b58fecedef 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3185,7 +3185,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
3185 * with threads. 3185 * with threads.
3186 */ 3186 */
3187 if (flags & FAULT_FLAG_WRITE) 3187 if (flags & FAULT_FLAG_WRITE)
3188 flush_tlb_page(vma, address); 3188 flush_tlb_fix_spurious_fault(vma, address);
3189 } 3189 }
3190unlock: 3190unlock:
3191 pte_unmap_unlock(pte, ptl); 3191 pte_unmap_unlock(pte, ptl);
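
flush_tlb_fix_spurious_fault() comes from the arch headers; the assumed generic fallback (sketched here, not part of this hunk) keeps the old flush_tlb_page() behaviour, while architectures that never see stale write-protect TLB entries can define it as a no-op.

#ifndef flush_tlb_fix_spurious_fault
#define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address)
#endif
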
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6b8889da69a6..d8087f0db507 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -517,6 +517,15 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
517static void purge_fragmented_blocks_allcpus(void); 517static void purge_fragmented_blocks_allcpus(void);
518 518
519/* 519/*
520 * called before a call to iounmap() if the caller wants vm_area_struct's
521 * immediately freed.
522 */
523void set_iounmap_nonlazy(void)
524{
525 atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
526}
527
528/*
520 * Purges all lazily-freed vmap areas. 529 * Purges all lazily-freed vmap areas.
521 * 530 *
522 * If sync is 0 then don't purge if there is already a purge in progress. 531 * If sync is 0 then don't purge if there is already a purge in progress.
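
A usage sketch (hypothetical caller) for the new hook: calling set_iounmap_nonlazy() right before iounmap() pushes the lazy-purge counter past its threshold, so the vm_area_struct is reclaimed immediately instead of waiting for the next batched purge.

/* Hypothetical caller: tear the mapping down without lazy deferral. */
static void sketch_iounmap_now(void __iomem *addr)
{
	set_iounmap_nonlazy();
	iounmap(addr);
}
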
diff --git a/net/Kconfig b/net/Kconfig
index e926884c1675..55fd82e9ffd9 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -293,6 +293,7 @@ source "net/wimax/Kconfig"
293source "net/rfkill/Kconfig" 293source "net/rfkill/Kconfig"
294source "net/9p/Kconfig" 294source "net/9p/Kconfig"
295source "net/caif/Kconfig" 295source "net/caif/Kconfig"
296source "net/ceph/Kconfig"
296 297
297 298
298endif # if NET 299endif # if NET
diff --git a/net/Makefile b/net/Makefile
index ea60fbce9b1b..6b7bfd7f1416 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -68,3 +68,4 @@ obj-$(CONFIG_SYSCTL) += sysctl_net.o
68endif 68endif
69obj-$(CONFIG_WIMAX) += wimax/ 69obj-$(CONFIG_WIMAX) += wimax/
70obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/ 70obj-$(CONFIG_DNS_RESOLVER) += dns_resolver/
71obj-$(CONFIG_CEPH_LIB) += ceph/
diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig
new file mode 100644
index 000000000000..ad424049b0cf
--- /dev/null
+++ b/net/ceph/Kconfig
@@ -0,0 +1,28 @@
1config CEPH_LIB
2 tristate "Ceph core library (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL
4 select LIBCRC32C
5 select CRYPTO_AES
6 select CRYPTO
7 default n
8 help
9 Choose Y or M here to include cephlib, which provides the
10 common functionality to both the Ceph filesystem and
11 to the rados block device (rbd).
12
13 More information at http://ceph.newdream.net/.
14
15 If unsure, say N.
16
17config CEPH_LIB_PRETTYDEBUG
18 bool "Include file:line in ceph debug output"
19 depends on CEPH_LIB
20 default n
21 help
22 If you say Y here, debug output will include a filename and
23 line to aid debugging. This increases kernel size and slows
24 execution slightly when debug call sites are enabled (e.g.,
25 via CONFIG_DYNAMIC_DEBUG).
26
27 If unsure, say N.
28
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
new file mode 100644
index 000000000000..aab1cabb8035
--- /dev/null
+++ b/net/ceph/Makefile
@@ -0,0 +1,37 @@
1#
2# Makefile for CEPH filesystem.
3#
4
5ifneq ($(KERNELRELEASE),)
6
7obj-$(CONFIG_CEPH_LIB) += libceph.o
8
9libceph-objs := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
10 mon_client.o \
11 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
12 debugfs.o \
13 auth.o auth_none.o \
14 crypto.o armor.o \
15 auth_x.o \
16 ceph_fs.o ceph_strings.o ceph_hash.o \
17 pagevec.o
18
19else
20#Otherwise we were called directly from the command
21# line; invoke the kernel build system.
22
23KERNELDIR ?= /lib/modules/$(shell uname -r)/build
24PWD := $(shell pwd)
25
26default: all
27
28all:
29 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules
30
31modules_install:
32 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_LIB=m modules_install
33
34clean:
35 $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
36
37endif
diff --git a/fs/ceph/armor.c b/net/ceph/armor.c
index eb2a666b0be7..eb2a666b0be7 100644
--- a/fs/ceph/armor.c
+++ b/net/ceph/armor.c
diff --git a/fs/ceph/auth.c b/net/ceph/auth.c
index 6d2e30600627..549c1f43e1d5 100644
--- a/fs/ceph/auth.c
+++ b/net/ceph/auth.c
@@ -1,16 +1,16 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/err.h> 4#include <linux/err.h>
5#include <linux/slab.h> 5#include <linux/slab.h>
6 6
7#include "types.h" 7#include <linux/ceph/types.h>
8#include <linux/ceph/decode.h>
9#include <linux/ceph/libceph.h>
10#include <linux/ceph/messenger.h>
8#include "auth_none.h" 11#include "auth_none.h"
9#include "auth_x.h" 12#include "auth_x.h"
10#include "decode.h"
11#include "super.h"
12 13
13#include "messenger.h"
14 14
15/* 15/*
16 * get protocol handler 16 * get protocol handler
diff --git a/fs/ceph/auth_none.c b/net/ceph/auth_none.c
index ad1dc21286c7..214c2bb43d62 100644
--- a/fs/ceph/auth_none.c
+++ b/net/ceph/auth_none.c
@@ -1,14 +1,15 @@
1 1
2#include "ceph_debug.h" 2#include <linux/ceph/ceph_debug.h>
3 3
4#include <linux/err.h> 4#include <linux/err.h>
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/random.h> 6#include <linux/random.h>
7#include <linux/slab.h> 7#include <linux/slab.h>
8 8
9#include <linux/ceph/decode.h>
10#include <linux/ceph/auth.h>
11
9#include "auth_none.h" 12#include "auth_none.h"
10#include "auth.h"
11#include "decode.h"
12 13
13static void reset(struct ceph_auth_client *ac) 14static void reset(struct ceph_auth_client *ac)
14{ 15{
diff --git a/fs/ceph/auth_none.h b/net/ceph/auth_none.h
index 8164df1a08be..ed7d088b1bc9 100644
--- a/fs/ceph/auth_none.h
+++ b/net/ceph/auth_none.h
@@ -2,8 +2,7 @@
2#define _FS_CEPH_AUTH_NONE_H 2#define _FS_CEPH_AUTH_NONE_H
3 3
4#include <linux/slab.h> 4#include <linux/slab.h>
5 5#include <linux/ceph/auth.h>
6#include "auth.h"
7 6
8/* 7/*
9 * null security mode. 8 * null security mode.
diff --git a/fs/ceph/auth_x.c b/net/ceph/auth_x.c
index a2d002cbdec2..7fd5dfcf6e18 100644
--- a/fs/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -1,16 +1,17 @@
1 1
2#include "ceph_debug.h" 2#include <linux/ceph/ceph_debug.h>
3 3
4#include <linux/err.h> 4#include <linux/err.h>
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/random.h> 6#include <linux/random.h>
7#include <linux/slab.h> 7#include <linux/slab.h>
8 8
9#include <linux/ceph/decode.h>
10#include <linux/ceph/auth.h>
11
12#include "crypto.h"
9#include "auth_x.h" 13#include "auth_x.h"
10#include "auth_x_protocol.h" 14#include "auth_x_protocol.h"
11#include "crypto.h"
12#include "auth.h"
13#include "decode.h"
14 15
15#define TEMP_TICKET_BUF_LEN 256 16#define TEMP_TICKET_BUF_LEN 256
16 17
diff --git a/fs/ceph/auth_x.h b/net/ceph/auth_x.h
index ff6f8180e681..e02da7a5c5a1 100644
--- a/fs/ceph/auth_x.h
+++ b/net/ceph/auth_x.h
@@ -3,8 +3,9 @@
3 3
4#include <linux/rbtree.h> 4#include <linux/rbtree.h>
5 5
6#include <linux/ceph/auth.h>
7
6#include "crypto.h" 8#include "crypto.h"
7#include "auth.h"
8#include "auth_x_protocol.h" 9#include "auth_x_protocol.h"
9 10
10/* 11/*
diff --git a/fs/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h
index 671d30576c4f..671d30576c4f 100644
--- a/fs/ceph/auth_x_protocol.h
+++ b/net/ceph/auth_x_protocol.h
diff --git a/fs/ceph/buffer.c b/net/ceph/buffer.c
index cd39f17021de..53d8abfa25d5 100644
--- a/fs/ceph/buffer.c
+++ b/net/ceph/buffer.c
@@ -1,10 +1,11 @@
1 1
2#include "ceph_debug.h" 2#include <linux/ceph/ceph_debug.h>
3 3
4#include <linux/module.h>
4#include <linux/slab.h> 5#include <linux/slab.h>
5 6
6#include "buffer.h" 7#include <linux/ceph/buffer.h>
7#include "decode.h" 8#include <linux/ceph/decode.h>
8 9
9struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) 10struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
10{ 11{
@@ -32,6 +33,7 @@ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
32 dout("buffer_new %p\n", b); 33 dout("buffer_new %p\n", b);
33 return b; 34 return b;
34} 35}
36EXPORT_SYMBOL(ceph_buffer_new);
35 37
36void ceph_buffer_release(struct kref *kref) 38void ceph_buffer_release(struct kref *kref)
37{ 39{
@@ -46,6 +48,7 @@ void ceph_buffer_release(struct kref *kref)
46 } 48 }
47 kfree(b); 49 kfree(b);
48} 50}
51EXPORT_SYMBOL(ceph_buffer_release);
49 52
50int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end) 53int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
51{ 54{
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
new file mode 100644
index 000000000000..f3e4a13fea0c
--- /dev/null
+++ b/net/ceph/ceph_common.c
@@ -0,0 +1,529 @@
1
2#include <linux/ceph/ceph_debug.h>
3#include <linux/backing-dev.h>
4#include <linux/ctype.h>
5#include <linux/fs.h>
6#include <linux/inet.h>
7#include <linux/in6.h>
8#include <linux/module.h>
9#include <linux/mount.h>
10#include <linux/parser.h>
11#include <linux/sched.h>
12#include <linux/seq_file.h>
13#include <linux/slab.h>
14#include <linux/statfs.h>
15#include <linux/string.h>
16
17
18#include <linux/ceph/libceph.h>
19#include <linux/ceph/debugfs.h>
20#include <linux/ceph/decode.h>
21#include <linux/ceph/mon_client.h>
22#include <linux/ceph/auth.h>
23
24
25
26/*
27 * find filename portion of a path (/foo/bar/baz -> baz)
28 */
29const char *ceph_file_part(const char *s, int len)
30{
31 const char *e = s + len;
32
33 while (e != s && *(e-1) != '/')
34 e--;
35 return e;
36}
37EXPORT_SYMBOL(ceph_file_part);
38
39const char *ceph_msg_type_name(int type)
40{
41 switch (type) {
42 case CEPH_MSG_SHUTDOWN: return "shutdown";
43 case CEPH_MSG_PING: return "ping";
44 case CEPH_MSG_AUTH: return "auth";
45 case CEPH_MSG_AUTH_REPLY: return "auth_reply";
46 case CEPH_MSG_MON_MAP: return "mon_map";
47 case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
48 case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
49 case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
50 case CEPH_MSG_STATFS: return "statfs";
51 case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
52 case CEPH_MSG_MDS_MAP: return "mds_map";
53 case CEPH_MSG_CLIENT_SESSION: return "client_session";
54 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
55 case CEPH_MSG_CLIENT_REQUEST: return "client_request";
56 case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
57 case CEPH_MSG_CLIENT_REPLY: return "client_reply";
58 case CEPH_MSG_CLIENT_CAPS: return "client_caps";
59 case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
60 case CEPH_MSG_CLIENT_SNAP: return "client_snap";
61 case CEPH_MSG_CLIENT_LEASE: return "client_lease";
62 case CEPH_MSG_OSD_MAP: return "osd_map";
63 case CEPH_MSG_OSD_OP: return "osd_op";
64 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
65 default: return "unknown";
66 }
67}
68EXPORT_SYMBOL(ceph_msg_type_name);
69
70/*
71 * Initially learn our fsid, or verify an fsid matches.
72 */
73int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
74{
75 if (client->have_fsid) {
76 if (ceph_fsid_compare(&client->fsid, fsid)) {
77 pr_err("bad fsid, had %pU got %pU",
78 &client->fsid, fsid);
79 return -1;
80 }
81 } else {
82 pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid);
83 memcpy(&client->fsid, fsid, sizeof(*fsid));
84 ceph_debugfs_client_init(client);
85 client->have_fsid = true;
86 }
87 return 0;
88}
89EXPORT_SYMBOL(ceph_check_fsid);
90
91static int strcmp_null(const char *s1, const char *s2)
92{
93 if (!s1 && !s2)
94 return 0;
95 if (s1 && !s2)
96 return -1;
97 if (!s1 && s2)
98 return 1;
99 return strcmp(s1, s2);
100}
101
102int ceph_compare_options(struct ceph_options *new_opt,
103 struct ceph_client *client)
104{
105 struct ceph_options *opt1 = new_opt;
106 struct ceph_options *opt2 = client->options;
107 int ofs = offsetof(struct ceph_options, mon_addr);
108 int i;
109 int ret;
110
111 ret = memcmp(opt1, opt2, ofs);
112 if (ret)
113 return ret;
114
115 ret = strcmp_null(opt1->name, opt2->name);
116 if (ret)
117 return ret;
118
119 ret = strcmp_null(opt1->secret, opt2->secret);
120 if (ret)
121 return ret;
122
123 /* any matching mon ip implies a match */
124 for (i = 0; i < opt1->num_mon; i++) {
125 if (ceph_monmap_contains(client->monc.monmap,
126 &opt1->mon_addr[i]))
127 return 0;
128 }
129 return -1;
130}
131EXPORT_SYMBOL(ceph_compare_options);
132
133
134static int parse_fsid(const char *str, struct ceph_fsid *fsid)
135{
136 int i = 0;
137 char tmp[3];
138 int err = -EINVAL;
139 int d;
140
141 dout("parse_fsid '%s'\n", str);
142 tmp[2] = 0;
143 while (*str && i < 16) {
144 if (ispunct(*str)) {
145 str++;
146 continue;
147 }
148 if (!isxdigit(str[0]) || !isxdigit(str[1]))
149 break;
150 tmp[0] = str[0];
151 tmp[1] = str[1];
152 if (sscanf(tmp, "%x", &d) < 1)
153 break;
154 fsid->fsid[i] = d & 0xff;
155 i++;
156 str += 2;
157 }
158
159 if (i == 16)
160 err = 0;
161 dout("parse_fsid ret %d got fsid %pU", err, fsid);
162 return err;
163}
164
165/*
166 * ceph options
167 */
168enum {
169 Opt_osdtimeout,
170 Opt_osdkeepalivetimeout,
171 Opt_mount_timeout,
172 Opt_osd_idle_ttl,
173 Opt_last_int,
174 /* int args above */
175 Opt_fsid,
176 Opt_name,
177 Opt_secret,
178 Opt_ip,
179 Opt_last_string,
180 /* string args above */
181 Opt_noshare,
182 Opt_nocrc,
183};
184
185static match_table_t opt_tokens = {
186 {Opt_osdtimeout, "osdtimeout=%d"},
187 {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
188 {Opt_mount_timeout, "mount_timeout=%d"},
189 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
190 /* int args above */
191 {Opt_fsid, "fsid=%s"},
192 {Opt_name, "name=%s"},
193 {Opt_secret, "secret=%s"},
194 {Opt_ip, "ip=%s"},
195 /* string args above */
196 {Opt_noshare, "noshare"},
197 {Opt_nocrc, "nocrc"},
198 {-1, NULL}
199};
200
201void ceph_destroy_options(struct ceph_options *opt)
202{
203 dout("destroy_options %p\n", opt);
204 kfree(opt->name);
205 kfree(opt->secret);
206 kfree(opt);
207}
208EXPORT_SYMBOL(ceph_destroy_options);
209
210int ceph_parse_options(struct ceph_options **popt, char *options,
211 const char *dev_name, const char *dev_name_end,
212 int (*parse_extra_token)(char *c, void *private),
213 void *private)
214{
215 struct ceph_options *opt;
216 const char *c;
217 int err = -ENOMEM;
218 substring_t argstr[MAX_OPT_ARGS];
219
220 opt = kzalloc(sizeof(*opt), GFP_KERNEL);
221 if (!opt)
222 return err;
223 opt->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*opt->mon_addr),
224 GFP_KERNEL);
225 if (!opt->mon_addr)
226 goto out;
227
228 dout("parse_options %p options '%s' dev_name '%s'\n", opt, options,
229 dev_name);
230
231 /* start with defaults */
232 opt->flags = CEPH_OPT_DEFAULT;
233 opt->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
234 opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
235 opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
236 opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
237
238 /* get mon ip(s) */
239 /* ip1[:port1][,ip2[:port2]...] */
240 err = ceph_parse_ips(dev_name, dev_name_end, opt->mon_addr,
241 CEPH_MAX_MON, &opt->num_mon);
242 if (err < 0)
243 goto out;
244
245 /* parse mount options */
246 while ((c = strsep(&options, ",")) != NULL) {
247 int token, intval, ret;
248 if (!*c)
249 continue;
250 err = -EINVAL;
251 token = match_token((char *)c, opt_tokens, argstr);
252 if (token < 0 && parse_extra_token) {
253 /* extra? */
254 err = parse_extra_token((char *)c, private);
255 if (err < 0) {
256 pr_err("bad option at '%s'\n", c);
257 goto out;
258 }
259 continue;
260 }
261 if (token < Opt_last_int) {
262 ret = match_int(&argstr[0], &intval);
263 if (ret < 0) {
264 pr_err("bad mount option arg (not int) "
265 "at '%s'\n", c);
266 continue;
267 }
268 dout("got int token %d val %d\n", token, intval);
269 } else if (token > Opt_last_int && token < Opt_last_string) {
270 dout("got string token %d val %s\n", token,
271 argstr[0].from);
272 } else {
273 dout("got token %d\n", token);
274 }
275 switch (token) {
276 case Opt_ip:
277 err = ceph_parse_ips(argstr[0].from,
278 argstr[0].to,
279 &opt->my_addr,
280 1, NULL);
281 if (err < 0)
282 goto out;
283 opt->flags |= CEPH_OPT_MYIP;
284 break;
285
286 case Opt_fsid:
287 err = parse_fsid(argstr[0].from, &opt->fsid);
288 if (err == 0)
289 opt->flags |= CEPH_OPT_FSID;
290 break;
291 case Opt_name:
292 opt->name = kstrndup(argstr[0].from,
293 argstr[0].to-argstr[0].from,
294 GFP_KERNEL);
295 break;
296 case Opt_secret:
297 opt->secret = kstrndup(argstr[0].from,
298 argstr[0].to-argstr[0].from,
299 GFP_KERNEL);
300 break;
301
302 /* misc */
303 case Opt_osdtimeout:
304 opt->osd_timeout = intval;
305 break;
306 case Opt_osdkeepalivetimeout:
307 opt->osd_keepalive_timeout = intval;
308 break;
309 case Opt_osd_idle_ttl:
310 opt->osd_idle_ttl = intval;
311 break;
312 case Opt_mount_timeout:
313 opt->mount_timeout = intval;
314 break;
315
316 case Opt_noshare:
317 opt->flags |= CEPH_OPT_NOSHARE;
318 break;
319
320 case Opt_nocrc:
321 opt->flags |= CEPH_OPT_NOCRC;
322 break;
323
324 default:
325 BUG_ON(token);
326 }
327 }
328
329 /* success */
330 *popt = opt;
331 return 0;
332
333out:
334 ceph_destroy_options(opt);
335 return err;
336}
337EXPORT_SYMBOL(ceph_parse_options);
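A minimal calling sketch (monitor addresses and option values are made up; a filesystem would pass its own extra-token callback instead of NULL):

	struct ceph_options *opt;
	char options[] = "name=admin,osdtimeout=60,noshare";
	const char *mons = "192.168.0.1:6789,192.168.0.2:6789";
	int err;

	err = ceph_parse_options(&opt, options, mons, mons + strlen(mons),
				 NULL, NULL);	/* no extra-token parser */
	if (err < 0)
		return err;			/* opt already freed on error */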
338
339u64 ceph_client_id(struct ceph_client *client)
340{
341 return client->monc.auth->global_id;
342}
343EXPORT_SYMBOL(ceph_client_id);
344
345/*
346 * create a fresh client instance
347 */
348struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private)
349{
350 struct ceph_client *client;
351 int err = -ENOMEM;
352
353 client = kzalloc(sizeof(*client), GFP_KERNEL);
354 if (client == NULL)
355 return ERR_PTR(-ENOMEM);
356
357 client->private = private;
358 client->options = opt;
359
360 mutex_init(&client->mount_mutex);
361 init_waitqueue_head(&client->auth_wq);
362 client->auth_err = 0;
363
364 client->extra_mon_dispatch = NULL;
365 client->supported_features = CEPH_FEATURE_SUPPORTED_DEFAULT;
366 client->required_features = CEPH_FEATURE_REQUIRED_DEFAULT;
367
368 client->msgr = NULL;
369
370 /* subsystems */
371 err = ceph_monc_init(&client->monc, client);
372 if (err < 0)
373 goto fail;
374 err = ceph_osdc_init(&client->osdc, client);
375 if (err < 0)
376 goto fail_monc;
377
378 return client;
379
380fail_monc:
381 ceph_monc_stop(&client->monc);
382fail:
383 kfree(client);
384 return ERR_PTR(err);
385}
386EXPORT_SYMBOL(ceph_create_client);
387
388void ceph_destroy_client(struct ceph_client *client)
389{
390 dout("destroy_client %p\n", client);
391
392 /* unmount */
393 ceph_osdc_stop(&client->osdc);
394
395 /*
396 * make sure mds and osd connections close out before destroying
397 * the auth module, which is needed to free those connections'
398 * ceph_authorizers.
399 */
400 ceph_msgr_flush();
401
402 ceph_monc_stop(&client->monc);
403
404 ceph_debugfs_client_cleanup(client);
405
406 if (client->msgr)
407 ceph_messenger_destroy(client->msgr);
408
409 ceph_destroy_options(client->options);
410
411 kfree(client);
412 dout("destroy_client %p done\n", client);
413}
414EXPORT_SYMBOL(ceph_destroy_client);
415
416/*
 417 * true if we have the mon and osd maps, and have thus joined the cluster
418 */
419static int have_mon_and_osd_map(struct ceph_client *client)
420{
421 return client->monc.monmap && client->monc.monmap->epoch &&
422 client->osdc.osdmap && client->osdc.osdmap->epoch;
423}
424
425/*
 426 * mount: join the ceph cluster by opening a mon session and waiting for the mon and osd maps.
427 */
428int __ceph_open_session(struct ceph_client *client, unsigned long started)
429{
430 struct ceph_entity_addr *myaddr = NULL;
431 int err;
432 unsigned long timeout = client->options->mount_timeout * HZ;
433
434 /* initialize the messenger */
435 if (client->msgr == NULL) {
436 if (ceph_test_opt(client, MYIP))
437 myaddr = &client->options->my_addr;
438 client->msgr = ceph_messenger_create(myaddr,
439 client->supported_features,
440 client->required_features);
 441 if (IS_ERR(client->msgr)) {
 442 err = PTR_ERR(client->msgr); /* save err before clearing the pointer */
 443 client->msgr = NULL;
 444 return err;
 445 }
 446 client->msgr->nocrc = ceph_test_opt(client, NOCRC);
 447 }
448 /* open session, and wait for mon and osd maps */
449 err = ceph_monc_open_session(&client->monc);
450 if (err < 0)
451 return err;
452
453 while (!have_mon_and_osd_map(client)) {
454 err = -EIO;
455 if (timeout && time_after_eq(jiffies, started + timeout))
456 return err;
457
458 /* wait */
459 dout("mount waiting for mon_map\n");
460 err = wait_event_interruptible_timeout(client->auth_wq,
461 have_mon_and_osd_map(client) || (client->auth_err < 0),
462 timeout);
463 if (err == -EINTR || err == -ERESTARTSYS)
464 return err;
465 if (client->auth_err < 0)
466 return client->auth_err;
467 }
468
469 return 0;
470}
471EXPORT_SYMBOL(__ceph_open_session);
472
473
474int ceph_open_session(struct ceph_client *client)
475{
476 int ret;
477 unsigned long started = jiffies; /* note the start time */
478
479 dout("open_session start\n");
480 mutex_lock(&client->mount_mutex);
481
482 ret = __ceph_open_session(client, started);
483
484 mutex_unlock(&client->mount_mutex);
485 return ret;
486}
487EXPORT_SYMBOL(ceph_open_session);
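Putting the exported pieces together, the sequence a libceph user might follow (error handling abbreviated; opt comes from ceph_parse_options() above):

	struct ceph_client *client;
	int err;

	client = ceph_create_client(opt, NULL /* private data */);
	if (IS_ERR(client)) {
		ceph_destroy_options(opt);	/* not yet owned by a client */
		return PTR_ERR(client);
	}
	err = ceph_open_session(client);	/* join cluster, wait for maps */
	if (err < 0)
		ceph_destroy_client(client);	/* also frees opt */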
488
489
490static int __init init_ceph_lib(void)
491{
492 int ret = 0;
493
494 ret = ceph_debugfs_init();
495 if (ret < 0)
496 goto out;
497
498 ret = ceph_msgr_init();
499 if (ret < 0)
500 goto out_debugfs;
501
502 pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n",
503 CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL,
504 CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
505 CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
506
507 return 0;
508
509out_debugfs:
510 ceph_debugfs_cleanup();
511out:
512 return ret;
513}
514
515static void __exit exit_ceph_lib(void)
516{
517 dout("exit_ceph_lib\n");
518 ceph_msgr_exit();
519 ceph_debugfs_cleanup();
520}
521
522module_init(init_ceph_lib);
523module_exit(exit_ceph_lib);
524
525MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
526MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
527MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
528MODULE_DESCRIPTION("Ceph filesystem for Linux");
529MODULE_LICENSE("GPL");
diff --git a/fs/ceph/ceph_fs.c b/net/ceph/ceph_fs.c
index 3ac6cc7c1156..a3a3a31d3c37 100644
--- a/fs/ceph/ceph_fs.c
+++ b/net/ceph/ceph_fs.c
@@ -1,7 +1,8 @@
1/* 1/*
2 * Some non-inline ceph helpers 2 * Some non-inline ceph helpers
3 */ 3 */
4#include "types.h" 4#include <linux/module.h>
5#include <linux/ceph/types.h>
5 6
6/* 7/*
7 * return true if @layout appears to be valid 8 * return true if @layout appears to be valid
@@ -52,6 +53,7 @@ int ceph_flags_to_mode(int flags)
52 53
53 return mode; 54 return mode;
54} 55}
56EXPORT_SYMBOL(ceph_flags_to_mode);
55 57
56int ceph_caps_for_mode(int mode) 58int ceph_caps_for_mode(int mode)
57{ 59{
@@ -70,3 +72,4 @@ int ceph_caps_for_mode(int mode)
70 72
71 return caps; 73 return caps;
72} 74}
75EXPORT_SYMBOL(ceph_caps_for_mode);
diff --git a/fs/ceph/ceph_hash.c b/net/ceph/ceph_hash.c
index bd570015d147..815ef8826796 100644
--- a/fs/ceph/ceph_hash.c
+++ b/net/ceph/ceph_hash.c
@@ -1,5 +1,5 @@
1 1
2#include "types.h" 2#include <linux/ceph/types.h>
3 3
4/* 4/*
5 * Robert Jenkin's hash function. 5 * Robert Jenkin's hash function.
diff --git a/net/ceph/ceph_strings.c b/net/ceph/ceph_strings.c
new file mode 100644
index 000000000000..3fbda04de29c
--- /dev/null
+++ b/net/ceph/ceph_strings.c
@@ -0,0 +1,84 @@
1/*
2 * Ceph string constants
3 */
4#include <linux/module.h>
5#include <linux/ceph/types.h>
6
7const char *ceph_entity_type_name(int type)
8{
9 switch (type) {
10 case CEPH_ENTITY_TYPE_MDS: return "mds";
11 case CEPH_ENTITY_TYPE_OSD: return "osd";
12 case CEPH_ENTITY_TYPE_MON: return "mon";
13 case CEPH_ENTITY_TYPE_CLIENT: return "client";
14 case CEPH_ENTITY_TYPE_AUTH: return "auth";
15 default: return "unknown";
16 }
17}
18
19const char *ceph_osd_op_name(int op)
20{
21 switch (op) {
22 case CEPH_OSD_OP_READ: return "read";
23 case CEPH_OSD_OP_STAT: return "stat";
24
25 case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
26
27 case CEPH_OSD_OP_WRITE: return "write";
28 case CEPH_OSD_OP_DELETE: return "delete";
29 case CEPH_OSD_OP_TRUNCATE: return "truncate";
30 case CEPH_OSD_OP_ZERO: return "zero";
31 case CEPH_OSD_OP_WRITEFULL: return "writefull";
32 case CEPH_OSD_OP_ROLLBACK: return "rollback";
33
34 case CEPH_OSD_OP_APPEND: return "append";
35 case CEPH_OSD_OP_STARTSYNC: return "startsync";
36 case CEPH_OSD_OP_SETTRUNC: return "settrunc";
37 case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
38
39 case CEPH_OSD_OP_TMAPUP: return "tmapup";
40 case CEPH_OSD_OP_TMAPGET: return "tmapget";
41 case CEPH_OSD_OP_TMAPPUT: return "tmapput";
42
43 case CEPH_OSD_OP_GETXATTR: return "getxattr";
44 case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
45 case CEPH_OSD_OP_SETXATTR: return "setxattr";
46 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
47 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
48 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
49 case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
50
51 case CEPH_OSD_OP_PULL: return "pull";
52 case CEPH_OSD_OP_PUSH: return "push";
53 case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
54 case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
55 case CEPH_OSD_OP_SCRUB: return "scrub";
56
57 case CEPH_OSD_OP_WRLOCK: return "wrlock";
58 case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
59 case CEPH_OSD_OP_RDLOCK: return "rdlock";
60 case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
61 case CEPH_OSD_OP_UPLOCK: return "uplock";
62 case CEPH_OSD_OP_DNLOCK: return "dnlock";
63
64 case CEPH_OSD_OP_CALL: return "call";
65
66 case CEPH_OSD_OP_PGLS: return "pgls";
67 }
68 return "???";
69}
70
71
72const char *ceph_pool_op_name(int op)
73{
74 switch (op) {
75 case POOL_OP_CREATE: return "create";
76 case POOL_OP_DELETE: return "delete";
77 case POOL_OP_AUID_CHANGE: return "auid change";
78 case POOL_OP_CREATE_SNAP: return "create snap";
79 case POOL_OP_DELETE_SNAP: return "delete snap";
80 case POOL_OP_CREATE_UNMANAGED_SNAP: return "create unmanaged snap";
81 case POOL_OP_DELETE_UNMANAGED_SNAP: return "delete unmanaged snap";
82 }
83 return "???";
84}
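These are pure lookup tables for diagnostics; a typical use is just making debug output readable (illustrative sketch):

	int op = CEPH_OSD_OP_WRITEFULL;	/* example value from the table above */

	dout("resending %s request\n", ceph_osd_op_name(op));	/* "writefull" */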
diff --git a/fs/ceph/crush/crush.c b/net/ceph/crush/crush.c
index fabd302e5779..d6ebb13a18a4 100644
--- a/fs/ceph/crush/crush.c
+++ b/net/ceph/crush/crush.c
@@ -8,7 +8,7 @@
8# define BUG_ON(x) assert(!(x)) 8# define BUG_ON(x) assert(!(x))
9#endif 9#endif
10 10
11#include "crush.h" 11#include <linux/crush/crush.h>
12 12
13const char *crush_bucket_alg_name(int alg) 13const char *crush_bucket_alg_name(int alg)
14{ 14{
diff --git a/fs/ceph/crush/hash.c b/net/ceph/crush/hash.c
index 5873aed694bf..5bb63e37a8a1 100644
--- a/fs/ceph/crush/hash.c
+++ b/net/ceph/crush/hash.c
@@ -1,6 +1,6 @@
1 1
2#include <linux/types.h> 2#include <linux/types.h>
3#include "hash.h" 3#include <linux/crush/hash.h>
4 4
5/* 5/*
6 * Robert Jenkins' function for mixing 32-bit values 6 * Robert Jenkins' function for mixing 32-bit values
diff --git a/fs/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index a4eec133258e..42599e31dcad 100644
--- a/fs/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -18,8 +18,8 @@
18# define kfree(x) free(x) 18# define kfree(x) free(x)
19#endif 19#endif
20 20
21#include "crush.h" 21#include <linux/crush/crush.h>
22#include "hash.h" 22#include <linux/crush/hash.h>
23 23
24/* 24/*
25 * Implement the core CRUSH mapping algorithm. 25 * Implement the core CRUSH mapping algorithm.
diff --git a/fs/ceph/crypto.c b/net/ceph/crypto.c
index a3e627f63293..7b505b0c983f 100644
--- a/fs/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -1,13 +1,13 @@
1 1
2#include "ceph_debug.h" 2#include <linux/ceph/ceph_debug.h>
3 3
4#include <linux/err.h> 4#include <linux/err.h>
5#include <linux/scatterlist.h> 5#include <linux/scatterlist.h>
6#include <linux/slab.h> 6#include <linux/slab.h>
7#include <crypto/hash.h> 7#include <crypto/hash.h>
8 8
9#include <linux/ceph/decode.h>
9#include "crypto.h" 10#include "crypto.h"
10#include "decode.h"
11 11
12int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end) 12int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
13{ 13{
diff --git a/fs/ceph/crypto.h b/net/ceph/crypto.h
index bdf38607323c..f9eccace592b 100644
--- a/fs/ceph/crypto.h
+++ b/net/ceph/crypto.h
@@ -1,8 +1,8 @@
1#ifndef _FS_CEPH_CRYPTO_H 1#ifndef _FS_CEPH_CRYPTO_H
2#define _FS_CEPH_CRYPTO_H 2#define _FS_CEPH_CRYPTO_H
3 3
4#include "types.h" 4#include <linux/ceph/types.h>
5#include "buffer.h" 5#include <linux/ceph/buffer.h>
6 6
7/* 7/*
8 * cryptographic secret 8 * cryptographic secret
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
new file mode 100644
index 000000000000..27d4ea315d12
--- /dev/null
+++ b/net/ceph/debugfs.c
@@ -0,0 +1,267 @@
1#include <linux/ceph/ceph_debug.h>
2
3#include <linux/device.h>
4#include <linux/slab.h>
5#include <linux/module.h>
6#include <linux/ctype.h>
7#include <linux/debugfs.h>
8#include <linux/seq_file.h>
9
10#include <linux/ceph/libceph.h>
11#include <linux/ceph/mon_client.h>
12#include <linux/ceph/auth.h>
13#include <linux/ceph/debugfs.h>
14
15#ifdef CONFIG_DEBUG_FS
16
17/*
18 * Implement /sys/kernel/debug/ceph fun
19 *
20 * /sys/kernel/debug/ceph/client* - an instance of the ceph client
21 * .../osdmap - current osdmap
22 * .../monmap - current monmap
23 * .../osdc - active osd requests
24 * .../monc - mon client state
25 * .../dentry_lru - dump contents of dentry lru
26 * .../caps - expose cap (reservation) stats
27 * .../bdi - symlink to ../../bdi/something
28 */
29
30static struct dentry *ceph_debugfs_dir;
31
32static int monmap_show(struct seq_file *s, void *p)
33{
34 int i;
35 struct ceph_client *client = s->private;
36
37 if (client->monc.monmap == NULL)
38 return 0;
39
40 seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
41 for (i = 0; i < client->monc.monmap->num_mon; i++) {
42 struct ceph_entity_inst *inst =
43 &client->monc.monmap->mon_inst[i];
44
45 seq_printf(s, "\t%s%lld\t%s\n",
46 ENTITY_NAME(inst->name),
47 ceph_pr_addr(&inst->addr.in_addr));
48 }
49 return 0;
50}
51
52static int osdmap_show(struct seq_file *s, void *p)
53{
54 int i;
55 struct ceph_client *client = s->private;
56 struct rb_node *n;
57
58 if (client->osdc.osdmap == NULL)
59 return 0;
60 seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
61 seq_printf(s, "flags%s%s\n",
62 (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
63 " NEARFULL" : "",
64 (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
65 " FULL" : "");
66 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
67 struct ceph_pg_pool_info *pool =
68 rb_entry(n, struct ceph_pg_pool_info, node);
69 seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
70 pool->id, pool->v.pg_num, pool->pg_num_mask,
71 pool->v.lpg_num, pool->lpg_num_mask);
72 }
73 for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
74 struct ceph_entity_addr *addr =
75 &client->osdc.osdmap->osd_addr[i];
76 int state = client->osdc.osdmap->osd_state[i];
77 char sb[64];
78
79 seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
80 i, ceph_pr_addr(&addr->in_addr),
81 ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
82 ceph_osdmap_state_str(sb, sizeof(sb), state));
83 }
84 return 0;
85}
86
87static int monc_show(struct seq_file *s, void *p)
88{
89 struct ceph_client *client = s->private;
90 struct ceph_mon_generic_request *req;
91 struct ceph_mon_client *monc = &client->monc;
92 struct rb_node *rp;
93
94 mutex_lock(&monc->mutex);
95
96 if (monc->have_mdsmap)
97 seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
98 if (monc->have_osdmap)
99 seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
100 if (monc->want_next_osdmap)
101 seq_printf(s, "want next osdmap\n");
102
103 for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
104 __u16 op;
105 req = rb_entry(rp, struct ceph_mon_generic_request, node);
106 op = le16_to_cpu(req->request->hdr.type);
107 if (op == CEPH_MSG_STATFS)
108 seq_printf(s, "%lld statfs\n", req->tid);
109 else
110 seq_printf(s, "%lld unknown\n", req->tid);
111 }
112
113 mutex_unlock(&monc->mutex);
114 return 0;
115}
116
117static int osdc_show(struct seq_file *s, void *pp)
118{
119 struct ceph_client *client = s->private;
120 struct ceph_osd_client *osdc = &client->osdc;
121 struct rb_node *p;
122
123 mutex_lock(&osdc->request_mutex);
124 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
125 struct ceph_osd_request *req;
126 struct ceph_osd_request_head *head;
127 struct ceph_osd_op *op;
128 int num_ops;
129 int opcode, olen;
130 int i;
131
132 req = rb_entry(p, struct ceph_osd_request, r_node);
133
134 seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
135 req->r_osd ? req->r_osd->o_osd : -1,
136 le32_to_cpu(req->r_pgid.pool),
137 le16_to_cpu(req->r_pgid.ps));
138
139 head = req->r_request->front.iov_base;
140 op = (void *)(head + 1);
141
142 num_ops = le16_to_cpu(head->num_ops);
143 olen = le32_to_cpu(head->object_len);
144 seq_printf(s, "%.*s", olen,
145 (const char *)(head->ops + num_ops));
146
147 if (req->r_reassert_version.epoch)
148 seq_printf(s, "\t%u'%llu",
149 (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
150 le64_to_cpu(req->r_reassert_version.version));
151 else
152 seq_printf(s, "\t");
153
154 for (i = 0; i < num_ops; i++) {
155 opcode = le16_to_cpu(op->op);
156 seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
157 op++;
158 }
159
160 seq_printf(s, "\n");
161 }
162 mutex_unlock(&osdc->request_mutex);
163 return 0;
164}
165
166CEPH_DEFINE_SHOW_FUNC(monmap_show)
167CEPH_DEFINE_SHOW_FUNC(osdmap_show)
168CEPH_DEFINE_SHOW_FUNC(monc_show)
169CEPH_DEFINE_SHOW_FUNC(osdc_show)
170
171int ceph_debugfs_init(void)
172{
173 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
174 if (!ceph_debugfs_dir)
175 return -ENOMEM;
176 return 0;
177}
178
179void ceph_debugfs_cleanup(void)
180{
181 debugfs_remove(ceph_debugfs_dir);
182}
183
184int ceph_debugfs_client_init(struct ceph_client *client)
185{
186 int ret = -ENOMEM;
187 char name[80];
188
189 snprintf(name, sizeof(name), "%pU.client%lld", &client->fsid,
190 client->monc.auth->global_id);
191
192 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
193 if (!client->debugfs_dir)
194 goto out;
195
196 client->monc.debugfs_file = debugfs_create_file("monc",
197 0600,
198 client->debugfs_dir,
199 client,
200 &monc_show_fops);
201 if (!client->monc.debugfs_file)
202 goto out;
203
204 client->osdc.debugfs_file = debugfs_create_file("osdc",
205 0600,
206 client->debugfs_dir,
207 client,
208 &osdc_show_fops);
209 if (!client->osdc.debugfs_file)
210 goto out;
211
212 client->debugfs_monmap = debugfs_create_file("monmap",
213 0600,
214 client->debugfs_dir,
215 client,
216 &monmap_show_fops);
217 if (!client->debugfs_monmap)
218 goto out;
219
220 client->debugfs_osdmap = debugfs_create_file("osdmap",
221 0600,
222 client->debugfs_dir,
223 client,
224 &osdmap_show_fops);
225 if (!client->debugfs_osdmap)
226 goto out;
227
228 return 0;
229
230out:
231 ceph_debugfs_client_cleanup(client);
232 return ret;
233}
234
235void ceph_debugfs_client_cleanup(struct ceph_client *client)
236{
237 debugfs_remove(client->debugfs_osdmap);
238 debugfs_remove(client->debugfs_monmap);
239 debugfs_remove(client->osdc.debugfs_file);
240 debugfs_remove(client->monc.debugfs_file);
241 debugfs_remove(client->debugfs_dir);
242}
243
244#else /* CONFIG_DEBUG_FS */
245
246int ceph_debugfs_init(void)
247{
248 return 0;
249}
250
251void ceph_debugfs_cleanup(void)
252{
253}
254
255int ceph_debugfs_client_init(struct ceph_client *client)
256{
257 return 0;
258}
259
260void ceph_debugfs_client_cleanup(struct ceph_client *client)
261{
262}
263
264#endif /* CONFIG_DEBUG_FS */
265
266EXPORT_SYMBOL(ceph_debugfs_init);
267EXPORT_SYMBOL(ceph_debugfs_cleanup);
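Note the init error path leans on debugfs_remove() ignoring NULL dentries, so ceph_debugfs_client_cleanup() is safe to call however far setup got. A hypothetical caller can therefore treat a failure here as non-fatal:

	if (ceph_debugfs_client_init(client) < 0)
		pr_warning("ceph: debugfs setup failed, continuing without it\n");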
diff --git a/fs/ceph/messenger.c b/net/ceph/messenger.c
index 2502d76fcec1..0e8157ee5d43 100644
--- a/fs/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -1,4 +1,4 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/crc32c.h> 3#include <linux/crc32c.h>
4#include <linux/ctype.h> 4#include <linux/ctype.h>
@@ -9,12 +9,14 @@
9#include <linux/slab.h> 9#include <linux/slab.h>
10#include <linux/socket.h> 10#include <linux/socket.h>
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/bio.h>
13#include <linux/blkdev.h>
12#include <net/tcp.h> 14#include <net/tcp.h>
13 15
14#include "super.h" 16#include <linux/ceph/libceph.h>
15#include "messenger.h" 17#include <linux/ceph/messenger.h>
16#include "decode.h" 18#include <linux/ceph/decode.h>
17#include "pagelist.h" 19#include <linux/ceph/pagelist.h>
18 20
19/* 21/*
20 * Ceph uses the messenger to exchange ceph_msg messages with other 22 * Ceph uses the messenger to exchange ceph_msg messages with other
@@ -48,7 +50,7 @@ static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
48static DEFINE_SPINLOCK(addr_str_lock); 50static DEFINE_SPINLOCK(addr_str_lock);
49static int last_addr_str; 51static int last_addr_str;
50 52
51const char *pr_addr(const struct sockaddr_storage *ss) 53const char *ceph_pr_addr(const struct sockaddr_storage *ss)
52{ 54{
53 int i; 55 int i;
54 char *s; 56 char *s;
@@ -79,6 +81,7 @@ const char *pr_addr(const struct sockaddr_storage *ss)
79 81
80 return s; 82 return s;
81} 83}
84EXPORT_SYMBOL(ceph_pr_addr);
82 85
83static void encode_my_addr(struct ceph_messenger *msgr) 86static void encode_my_addr(struct ceph_messenger *msgr)
84{ 87{
@@ -91,7 +94,7 @@ static void encode_my_addr(struct ceph_messenger *msgr)
91 */ 94 */
92struct workqueue_struct *ceph_msgr_wq; 95struct workqueue_struct *ceph_msgr_wq;
93 96
94int __init ceph_msgr_init(void) 97int ceph_msgr_init(void)
95{ 98{
96 ceph_msgr_wq = create_workqueue("ceph-msgr"); 99 ceph_msgr_wq = create_workqueue("ceph-msgr");
97 if (IS_ERR(ceph_msgr_wq)) { 100 if (IS_ERR(ceph_msgr_wq)) {
@@ -102,16 +105,19 @@ int __init ceph_msgr_init(void)
102 } 105 }
103 return 0; 106 return 0;
104} 107}
108EXPORT_SYMBOL(ceph_msgr_init);
105 109
106void ceph_msgr_exit(void) 110void ceph_msgr_exit(void)
107{ 111{
108 destroy_workqueue(ceph_msgr_wq); 112 destroy_workqueue(ceph_msgr_wq);
109} 113}
114EXPORT_SYMBOL(ceph_msgr_exit);
110 115
111void ceph_msgr_flush(void) 116void ceph_msgr_flush(void)
112{ 117{
113 flush_workqueue(ceph_msgr_wq); 118 flush_workqueue(ceph_msgr_wq);
114} 119}
120EXPORT_SYMBOL(ceph_msgr_flush);
115 121
116 122
117/* 123/*
@@ -221,19 +227,19 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con)
221 227
222 set_sock_callbacks(sock, con); 228 set_sock_callbacks(sock, con);
223 229
224 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr)); 230 dout("connect %s\n", ceph_pr_addr(&con->peer_addr.in_addr));
225 231
226 ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr), 232 ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
227 O_NONBLOCK); 233 O_NONBLOCK);
228 if (ret == -EINPROGRESS) { 234 if (ret == -EINPROGRESS) {
229 dout("connect %s EINPROGRESS sk_state = %u\n", 235 dout("connect %s EINPROGRESS sk_state = %u\n",
230 pr_addr(&con->peer_addr.in_addr), 236 ceph_pr_addr(&con->peer_addr.in_addr),
231 sock->sk->sk_state); 237 sock->sk->sk_state);
232 ret = 0; 238 ret = 0;
233 } 239 }
234 if (ret < 0) { 240 if (ret < 0) {
235 pr_err("connect %s error %d\n", 241 pr_err("connect %s error %d\n",
236 pr_addr(&con->peer_addr.in_addr), ret); 242 ceph_pr_addr(&con->peer_addr.in_addr), ret);
237 sock_release(sock); 243 sock_release(sock);
238 con->sock = NULL; 244 con->sock = NULL;
239 con->error_msg = "connect error"; 245 con->error_msg = "connect error";
@@ -334,7 +340,8 @@ static void reset_connection(struct ceph_connection *con)
334 */ 340 */
335void ceph_con_close(struct ceph_connection *con) 341void ceph_con_close(struct ceph_connection *con)
336{ 342{
337 dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr)); 343 dout("con_close %p peer %s\n", con,
344 ceph_pr_addr(&con->peer_addr.in_addr));
338 set_bit(CLOSED, &con->state); /* in case there's queued work */ 345 set_bit(CLOSED, &con->state); /* in case there's queued work */
339 clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */ 346 clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
340 clear_bit(LOSSYTX, &con->state); /* so we retry next connect */ 347 clear_bit(LOSSYTX, &con->state); /* so we retry next connect */
@@ -347,19 +354,21 @@ void ceph_con_close(struct ceph_connection *con)
347 mutex_unlock(&con->mutex); 354 mutex_unlock(&con->mutex);
348 queue_con(con); 355 queue_con(con);
349} 356}
357EXPORT_SYMBOL(ceph_con_close);
350 358
351/* 359/*
352 * Reopen a closed connection, with a new peer address. 360 * Reopen a closed connection, with a new peer address.
353 */ 361 */
354void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr) 362void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
355{ 363{
356 dout("con_open %p %s\n", con, pr_addr(&addr->in_addr)); 364 dout("con_open %p %s\n", con, ceph_pr_addr(&addr->in_addr));
357 set_bit(OPENING, &con->state); 365 set_bit(OPENING, &con->state);
358 clear_bit(CLOSED, &con->state); 366 clear_bit(CLOSED, &con->state);
359 memcpy(&con->peer_addr, addr, sizeof(*addr)); 367 memcpy(&con->peer_addr, addr, sizeof(*addr));
360 con->delay = 0; /* reset backoff memory */ 368 con->delay = 0; /* reset backoff memory */
361 queue_con(con); 369 queue_con(con);
362} 370}
371EXPORT_SYMBOL(ceph_con_open);
363 372
364/* 373/*
365 * return true if this connection ever successfully opened 374 * return true if this connection ever successfully opened
@@ -406,6 +415,7 @@ void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
406 INIT_LIST_HEAD(&con->out_sent); 415 INIT_LIST_HEAD(&con->out_sent);
407 INIT_DELAYED_WORK(&con->work, con_work); 416 INIT_DELAYED_WORK(&con->work, con_work);
408} 417}
418EXPORT_SYMBOL(ceph_con_init);
409 419
410 420
411/* 421/*
@@ -529,8 +539,11 @@ static void prepare_write_message(struct ceph_connection *con)
529 if (le32_to_cpu(m->hdr.data_len) > 0) { 539 if (le32_to_cpu(m->hdr.data_len) > 0) {
530 /* initialize page iterator */ 540 /* initialize page iterator */
531 con->out_msg_pos.page = 0; 541 con->out_msg_pos.page = 0;
532 con->out_msg_pos.page_pos = 542 if (m->pages)
533 le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK; 543 con->out_msg_pos.page_pos =
544 le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
545 else
546 con->out_msg_pos.page_pos = 0;
534 con->out_msg_pos.data_pos = 0; 547 con->out_msg_pos.data_pos = 0;
535 con->out_msg_pos.did_page_crc = 0; 548 con->out_msg_pos.did_page_crc = 0;
536 con->out_more = 1; /* data + footer will follow */ 549 con->out_more = 1; /* data + footer will follow */
@@ -647,7 +660,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
647 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con, 660 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
648 con->connect_seq, global_seq, proto); 661 con->connect_seq, global_seq, proto);
649 662
650 con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED); 663 con->out_connect.features = cpu_to_le64(msgr->supported_features);
651 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT); 664 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
652 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq); 665 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
653 con->out_connect.global_seq = cpu_to_le32(global_seq); 666 con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -712,6 +725,31 @@ out:
712 return ret; /* done! */ 725 return ret; /* done! */
713} 726}
714 727
728#ifdef CONFIG_BLOCK
729static void init_bio_iter(struct bio *bio, struct bio **iter, int *seg)
730{
731 if (!bio) {
732 *iter = NULL;
733 *seg = 0;
734 return;
735 }
736 *iter = bio;
737 *seg = bio->bi_idx;
738}
739
740static void iter_bio_next(struct bio **bio_iter, int *seg)
741{
742 if (*bio_iter == NULL)
743 return;
744
745 BUG_ON(*seg >= (*bio_iter)->bi_vcnt);
746
747 (*seg)++;
748 if (*seg == (*bio_iter)->bi_vcnt)
749 init_bio_iter((*bio_iter)->bi_next, bio_iter, seg);
750}
751#endif
752
715/* 753/*
716 * Write as much message data payload as we can. If we finish, queue 754 * Write as much message data payload as we can. If we finish, queue
717 * up the footer. 755 * up the footer.
@@ -726,21 +764,46 @@ static int write_partial_msg_pages(struct ceph_connection *con)
726 size_t len; 764 size_t len;
727 int crc = con->msgr->nocrc; 765 int crc = con->msgr->nocrc;
728 int ret; 766 int ret;
767 int total_max_write;
768 int in_trail = 0;
769 size_t trail_len = (msg->trail ? msg->trail->length : 0);
729 770
730 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n", 771 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
731 con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages, 772 con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
732 con->out_msg_pos.page_pos); 773 con->out_msg_pos.page_pos);
733 774
734 while (con->out_msg_pos.page < con->out_msg->nr_pages) { 775#ifdef CONFIG_BLOCK
776 if (msg->bio && !msg->bio_iter)
777 init_bio_iter(msg->bio, &msg->bio_iter, &msg->bio_seg);
778#endif
779
780 while (data_len > con->out_msg_pos.data_pos) {
735 struct page *page = NULL; 781 struct page *page = NULL;
736 void *kaddr = NULL; 782 void *kaddr = NULL;
783 int max_write = PAGE_SIZE;
784 int page_shift = 0;
785
786 total_max_write = data_len - trail_len -
787 con->out_msg_pos.data_pos;
737 788
738 /* 789 /*
739 * if we are calculating the data crc (the default), we need 790 * if we are calculating the data crc (the default), we need
740 * to map the page. if our pages[] has been revoked, use the 791 * to map the page. if our pages[] has been revoked, use the
741 * zero page. 792 * zero page.
742 */ 793 */
743 if (msg->pages) { 794
795 /* have we reached the trail part of the data? */
796 if (con->out_msg_pos.data_pos >= data_len - trail_len) {
797 in_trail = 1;
798
799 total_max_write = data_len - con->out_msg_pos.data_pos;
800
801 page = list_first_entry(&msg->trail->head,
802 struct page, lru);
803 if (crc)
804 kaddr = kmap(page);
805 max_write = PAGE_SIZE;
806 } else if (msg->pages) {
744 page = msg->pages[con->out_msg_pos.page]; 807 page = msg->pages[con->out_msg_pos.page];
745 if (crc) 808 if (crc)
746 kaddr = kmap(page); 809 kaddr = kmap(page);
@@ -749,13 +812,25 @@ static int write_partial_msg_pages(struct ceph_connection *con)
749 struct page, lru); 812 struct page, lru);
750 if (crc) 813 if (crc)
751 kaddr = kmap(page); 814 kaddr = kmap(page);
815#ifdef CONFIG_BLOCK
816 } else if (msg->bio) {
817 struct bio_vec *bv;
818
819 bv = bio_iovec_idx(msg->bio_iter, msg->bio_seg);
820 page = bv->bv_page;
821 page_shift = bv->bv_offset;
822 if (crc)
823 kaddr = kmap(page) + page_shift;
824 max_write = bv->bv_len;
825#endif
752 } else { 826 } else {
753 page = con->msgr->zero_page; 827 page = con->msgr->zero_page;
754 if (crc) 828 if (crc)
755 kaddr = page_address(con->msgr->zero_page); 829 kaddr = page_address(con->msgr->zero_page);
756 } 830 }
757 len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos), 831 len = min_t(int, max_write - con->out_msg_pos.page_pos,
758 (int)(data_len - con->out_msg_pos.data_pos)); 832 total_max_write);
833
759 if (crc && !con->out_msg_pos.did_page_crc) { 834 if (crc && !con->out_msg_pos.did_page_crc) {
760 void *base = kaddr + con->out_msg_pos.page_pos; 835 void *base = kaddr + con->out_msg_pos.page_pos;
761 u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc); 836 u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
@@ -765,13 +840,14 @@ static int write_partial_msg_pages(struct ceph_connection *con)
765 cpu_to_le32(crc32c(tmpcrc, base, len)); 840 cpu_to_le32(crc32c(tmpcrc, base, len));
766 con->out_msg_pos.did_page_crc = 1; 841 con->out_msg_pos.did_page_crc = 1;
767 } 842 }
768
769 ret = kernel_sendpage(con->sock, page, 843 ret = kernel_sendpage(con->sock, page,
770 con->out_msg_pos.page_pos, len, 844 con->out_msg_pos.page_pos + page_shift,
845 len,
771 MSG_DONTWAIT | MSG_NOSIGNAL | 846 MSG_DONTWAIT | MSG_NOSIGNAL |
772 MSG_MORE); 847 MSG_MORE);
773 848
774 if (crc && (msg->pages || msg->pagelist)) 849 if (crc &&
850 (msg->pages || msg->pagelist || msg->bio || in_trail))
775 kunmap(page); 851 kunmap(page);
776 852
777 if (ret <= 0) 853 if (ret <= 0)
@@ -783,9 +859,16 @@ static int write_partial_msg_pages(struct ceph_connection *con)
783 con->out_msg_pos.page_pos = 0; 859 con->out_msg_pos.page_pos = 0;
784 con->out_msg_pos.page++; 860 con->out_msg_pos.page++;
785 con->out_msg_pos.did_page_crc = 0; 861 con->out_msg_pos.did_page_crc = 0;
786 if (msg->pagelist) 862 if (in_trail)
863 list_move_tail(&page->lru,
864 &msg->trail->head);
865 else if (msg->pagelist)
787 list_move_tail(&page->lru, 866 list_move_tail(&page->lru,
788 &msg->pagelist->head); 867 &msg->pagelist->head);
868#ifdef CONFIG_BLOCK
869 else if (msg->bio)
870 iter_bio_next(&msg->bio_iter, &msg->bio_seg);
871#endif
789 } 872 }
790 } 873 }
791 874
@@ -938,7 +1021,7 @@ static int verify_hello(struct ceph_connection *con)
938{ 1021{
939 if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) { 1022 if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
940 pr_err("connect to %s got bad banner\n", 1023 pr_err("connect to %s got bad banner\n",
941 pr_addr(&con->peer_addr.in_addr)); 1024 ceph_pr_addr(&con->peer_addr.in_addr));
942 con->error_msg = "protocol error, bad banner"; 1025 con->error_msg = "protocol error, bad banner";
943 return -1; 1026 return -1;
944 } 1027 }
@@ -1041,7 +1124,7 @@ int ceph_parse_ips(const char *c, const char *end,
1041 1124
1042 addr_set_port(ss, port); 1125 addr_set_port(ss, port);
1043 1126
1044 dout("parse_ips got %s\n", pr_addr(ss)); 1127 dout("parse_ips got %s\n", ceph_pr_addr(ss));
1045 1128
1046 if (p == end) 1129 if (p == end)
1047 break; 1130 break;
@@ -1061,6 +1144,7 @@ bad:
1061 pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c); 1144 pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
1062 return -EINVAL; 1145 return -EINVAL;
1063} 1146}
1147EXPORT_SYMBOL(ceph_parse_ips);
1064 1148
1065static int process_banner(struct ceph_connection *con) 1149static int process_banner(struct ceph_connection *con)
1066{ 1150{
@@ -1082,9 +1166,9 @@ static int process_banner(struct ceph_connection *con)
1082 !(addr_is_blank(&con->actual_peer_addr.in_addr) && 1166 !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
1083 con->actual_peer_addr.nonce == con->peer_addr.nonce)) { 1167 con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
1084 pr_warning("wrong peer, want %s/%d, got %s/%d\n", 1168 pr_warning("wrong peer, want %s/%d, got %s/%d\n",
1085 pr_addr(&con->peer_addr.in_addr), 1169 ceph_pr_addr(&con->peer_addr.in_addr),
1086 (int)le32_to_cpu(con->peer_addr.nonce), 1170 (int)le32_to_cpu(con->peer_addr.nonce),
1087 pr_addr(&con->actual_peer_addr.in_addr), 1171 ceph_pr_addr(&con->actual_peer_addr.in_addr),
1088 (int)le32_to_cpu(con->actual_peer_addr.nonce)); 1172 (int)le32_to_cpu(con->actual_peer_addr.nonce));
1089 con->error_msg = "wrong peer at address"; 1173 con->error_msg = "wrong peer at address";
1090 return -1; 1174 return -1;
@@ -1102,7 +1186,7 @@ static int process_banner(struct ceph_connection *con)
1102 addr_set_port(&con->msgr->inst.addr.in_addr, port); 1186 addr_set_port(&con->msgr->inst.addr.in_addr, port);
1103 encode_my_addr(con->msgr); 1187 encode_my_addr(con->msgr);
1104 dout("process_banner learned my addr is %s\n", 1188 dout("process_banner learned my addr is %s\n",
1105 pr_addr(&con->msgr->inst.addr.in_addr)); 1189 ceph_pr_addr(&con->msgr->inst.addr.in_addr));
1106 } 1190 }
1107 1191
1108 set_bit(NEGOTIATING, &con->state); 1192 set_bit(NEGOTIATING, &con->state);
@@ -1123,8 +1207,8 @@ static void fail_protocol(struct ceph_connection *con)
1123 1207
1124static int process_connect(struct ceph_connection *con) 1208static int process_connect(struct ceph_connection *con)
1125{ 1209{
1126 u64 sup_feat = CEPH_FEATURE_SUPPORTED; 1210 u64 sup_feat = con->msgr->supported_features;
1127 u64 req_feat = CEPH_FEATURE_REQUIRED; 1211 u64 req_feat = con->msgr->required_features;
1128 u64 server_feat = le64_to_cpu(con->in_reply.features); 1212 u64 server_feat = le64_to_cpu(con->in_reply.features);
1129 1213
1130 dout("process_connect on %p tag %d\n", con, (int)con->in_tag); 1214 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
@@ -1134,7 +1218,7 @@ static int process_connect(struct ceph_connection *con)
1134 pr_err("%s%lld %s feature set mismatch," 1218 pr_err("%s%lld %s feature set mismatch,"
1135 " my %llx < server's %llx, missing %llx\n", 1219 " my %llx < server's %llx, missing %llx\n",
1136 ENTITY_NAME(con->peer_name), 1220 ENTITY_NAME(con->peer_name),
1137 pr_addr(&con->peer_addr.in_addr), 1221 ceph_pr_addr(&con->peer_addr.in_addr),
1138 sup_feat, server_feat, server_feat & ~sup_feat); 1222 sup_feat, server_feat, server_feat & ~sup_feat);
1139 con->error_msg = "missing required protocol features"; 1223 con->error_msg = "missing required protocol features";
1140 fail_protocol(con); 1224 fail_protocol(con);
@@ -1144,7 +1228,7 @@ static int process_connect(struct ceph_connection *con)
1144 pr_err("%s%lld %s protocol version mismatch," 1228 pr_err("%s%lld %s protocol version mismatch,"
1145 " my %d != server's %d\n", 1229 " my %d != server's %d\n",
1146 ENTITY_NAME(con->peer_name), 1230 ENTITY_NAME(con->peer_name),
1147 pr_addr(&con->peer_addr.in_addr), 1231 ceph_pr_addr(&con->peer_addr.in_addr),
1148 le32_to_cpu(con->out_connect.protocol_version), 1232 le32_to_cpu(con->out_connect.protocol_version),
1149 le32_to_cpu(con->in_reply.protocol_version)); 1233 le32_to_cpu(con->in_reply.protocol_version));
1150 con->error_msg = "protocol version mismatch"; 1234 con->error_msg = "protocol version mismatch";
@@ -1178,7 +1262,7 @@ static int process_connect(struct ceph_connection *con)
1178 le32_to_cpu(con->in_connect.connect_seq)); 1262 le32_to_cpu(con->in_connect.connect_seq));
1179 pr_err("%s%lld %s connection reset\n", 1263 pr_err("%s%lld %s connection reset\n",
1180 ENTITY_NAME(con->peer_name), 1264 ENTITY_NAME(con->peer_name),
1181 pr_addr(&con->peer_addr.in_addr)); 1265 ceph_pr_addr(&con->peer_addr.in_addr));
1182 reset_connection(con); 1266 reset_connection(con);
1183 prepare_write_connect(con->msgr, con, 0); 1267 prepare_write_connect(con->msgr, con, 0);
1184 prepare_read_connect(con); 1268 prepare_read_connect(con);
@@ -1223,7 +1307,7 @@ static int process_connect(struct ceph_connection *con)
1223 pr_err("%s%lld %s protocol feature mismatch," 1307 pr_err("%s%lld %s protocol feature mismatch,"
1224 " my required %llx > server's %llx, need %llx\n", 1308 " my required %llx > server's %llx, need %llx\n",
1225 ENTITY_NAME(con->peer_name), 1309 ENTITY_NAME(con->peer_name),
1226 pr_addr(&con->peer_addr.in_addr), 1310 ceph_pr_addr(&con->peer_addr.in_addr),
1227 req_feat, server_feat, req_feat & ~server_feat); 1311 req_feat, server_feat, req_feat & ~server_feat);
1228 con->error_msg = "missing required protocol features"; 1312 con->error_msg = "missing required protocol features";
1229 fail_protocol(con); 1313 fail_protocol(con);
@@ -1305,8 +1389,7 @@ static int read_partial_message_section(struct ceph_connection *con,
1305 struct kvec *section, 1389 struct kvec *section,
1306 unsigned int sec_len, u32 *crc) 1390 unsigned int sec_len, u32 *crc)
1307{ 1391{
1308 int left; 1392 int ret, left;
1309 int ret;
1310 1393
1311 BUG_ON(!section); 1394 BUG_ON(!section);
1312 1395
@@ -1329,13 +1412,83 @@ static int read_partial_message_section(struct ceph_connection *con,
1329static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con, 1412static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
1330 struct ceph_msg_header *hdr, 1413 struct ceph_msg_header *hdr,
1331 int *skip); 1414 int *skip);
1415
1416
1417static int read_partial_message_pages(struct ceph_connection *con,
1418 struct page **pages,
1419 unsigned data_len, int datacrc)
1420{
1421 void *p;
1422 int ret;
1423 int left;
1424
1425 left = min((int)(data_len - con->in_msg_pos.data_pos),
1426 (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
1427 /* (page) data */
1428 BUG_ON(pages == NULL);
1429 p = kmap(pages[con->in_msg_pos.page]);
1430 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1431 left);
1432 if (ret > 0 && datacrc)
1433 con->in_data_crc =
1434 crc32c(con->in_data_crc,
1435 p + con->in_msg_pos.page_pos, ret);
1436 kunmap(pages[con->in_msg_pos.page]);
1437 if (ret <= 0)
1438 return ret;
1439 con->in_msg_pos.data_pos += ret;
1440 con->in_msg_pos.page_pos += ret;
1441 if (con->in_msg_pos.page_pos == PAGE_SIZE) {
1442 con->in_msg_pos.page_pos = 0;
1443 con->in_msg_pos.page++;
1444 }
1445
1446 return ret;
1447}
1448
1449#ifdef CONFIG_BLOCK
1450static int read_partial_message_bio(struct ceph_connection *con,
1451 struct bio **bio_iter, int *bio_seg,
1452 unsigned data_len, int datacrc)
1453{
1454 struct bio_vec *bv = bio_iovec_idx(*bio_iter, *bio_seg);
1455 void *p;
1456 int ret, left;
1457
1458 if (IS_ERR(bv))
1459 return PTR_ERR(bv);
1460
1461 left = min((int)(data_len - con->in_msg_pos.data_pos),
1462 (int)(bv->bv_len - con->in_msg_pos.page_pos));
1463
1464 p = kmap(bv->bv_page) + bv->bv_offset;
1465
1466 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1467 left);
1468 if (ret > 0 && datacrc)
1469 con->in_data_crc =
1470 crc32c(con->in_data_crc,
1471 p + con->in_msg_pos.page_pos, ret);
1472 kunmap(bv->bv_page);
1473 if (ret <= 0)
1474 return ret;
1475 con->in_msg_pos.data_pos += ret;
1476 con->in_msg_pos.page_pos += ret;
1477 if (con->in_msg_pos.page_pos == bv->bv_len) {
1478 con->in_msg_pos.page_pos = 0;
1479 iter_bio_next(bio_iter, bio_seg);
1480 }
1481
1482 return ret;
1483}
1484#endif
1485
1332/* 1486/*
1333 * read (part of) a message. 1487 * read (part of) a message.
1334 */ 1488 */
1335static int read_partial_message(struct ceph_connection *con) 1489static int read_partial_message(struct ceph_connection *con)
1336{ 1490{
1337 struct ceph_msg *m = con->in_msg; 1491 struct ceph_msg *m = con->in_msg;
1338 void *p;
1339 int ret; 1492 int ret;
1340 int to, left; 1493 int to, left;
1341 unsigned front_len, middle_len, data_len, data_off; 1494 unsigned front_len, middle_len, data_len, data_off;
@@ -1381,7 +1534,7 @@ static int read_partial_message(struct ceph_connection *con)
1381 if ((s64)seq - (s64)con->in_seq < 1) { 1534 if ((s64)seq - (s64)con->in_seq < 1) {
1382 pr_info("skipping %s%lld %s seq %lld, expected %lld\n", 1535 pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
1383 ENTITY_NAME(con->peer_name), 1536 ENTITY_NAME(con->peer_name),
1384 pr_addr(&con->peer_addr.in_addr), 1537 ceph_pr_addr(&con->peer_addr.in_addr),
1385 seq, con->in_seq + 1); 1538 seq, con->in_seq + 1);
1386 con->in_base_pos = -front_len - middle_len - data_len - 1539 con->in_base_pos = -front_len - middle_len - data_len -
1387 sizeof(m->footer); 1540 sizeof(m->footer);
@@ -1422,7 +1575,10 @@ static int read_partial_message(struct ceph_connection *con)
1422 m->middle->vec.iov_len = 0; 1575 m->middle->vec.iov_len = 0;
1423 1576
1424 con->in_msg_pos.page = 0; 1577 con->in_msg_pos.page = 0;
1425 con->in_msg_pos.page_pos = data_off & ~PAGE_MASK; 1578 if (m->pages)
1579 con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
1580 else
1581 con->in_msg_pos.page_pos = 0;
1426 con->in_msg_pos.data_pos = 0; 1582 con->in_msg_pos.data_pos = 0;
1427 } 1583 }
1428 1584
@@ -1440,27 +1596,29 @@ static int read_partial_message(struct ceph_connection *con)
1440 if (ret <= 0) 1596 if (ret <= 0)
1441 return ret; 1597 return ret;
1442 } 1598 }
1599#ifdef CONFIG_BLOCK
1600 if (m->bio && !m->bio_iter)
1601 init_bio_iter(m->bio, &m->bio_iter, &m->bio_seg);
1602#endif
1443 1603
1444 /* (page) data */ 1604 /* (page) data */
1445 while (con->in_msg_pos.data_pos < data_len) { 1605 while (con->in_msg_pos.data_pos < data_len) {
1446 left = min((int)(data_len - con->in_msg_pos.data_pos), 1606 if (m->pages) {
1447 (int)(PAGE_SIZE - con->in_msg_pos.page_pos)); 1607 ret = read_partial_message_pages(con, m->pages,
1448 BUG_ON(m->pages == NULL); 1608 data_len, datacrc);
1449 p = kmap(m->pages[con->in_msg_pos.page]); 1609 if (ret <= 0)
1450 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos, 1610 return ret;
1451 left); 1611#ifdef CONFIG_BLOCK
1452 if (ret > 0 && datacrc) 1612 } else if (m->bio) {
1453 con->in_data_crc = 1613
1454 crc32c(con->in_data_crc, 1614 ret = read_partial_message_bio(con,
1455 p + con->in_msg_pos.page_pos, ret); 1615 &m->bio_iter, &m->bio_seg,
1456 kunmap(m->pages[con->in_msg_pos.page]); 1616 data_len, datacrc);
1457 if (ret <= 0) 1617 if (ret <= 0)
1458 return ret; 1618 return ret;
1459 con->in_msg_pos.data_pos += ret; 1619#endif
1460 con->in_msg_pos.page_pos += ret; 1620 } else {
1461 if (con->in_msg_pos.page_pos == PAGE_SIZE) { 1621 BUG_ON(1);
1462 con->in_msg_pos.page_pos = 0;
1463 con->in_msg_pos.page++;
1464 } 1622 }
1465 } 1623 }
1466 1624
@@ -1874,9 +2032,9 @@ out:
1874static void ceph_fault(struct ceph_connection *con) 2032static void ceph_fault(struct ceph_connection *con)
1875{ 2033{
1876 pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name), 2034 pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
1877 pr_addr(&con->peer_addr.in_addr), con->error_msg); 2035 ceph_pr_addr(&con->peer_addr.in_addr), con->error_msg);
1878 dout("fault %p state %lu to peer %s\n", 2036 dout("fault %p state %lu to peer %s\n",
1879 con, con->state, pr_addr(&con->peer_addr.in_addr)); 2037 con, con->state, ceph_pr_addr(&con->peer_addr.in_addr));
1880 2038
1881 if (test_bit(LOSSYTX, &con->state)) { 2039 if (test_bit(LOSSYTX, &con->state)) {
1882 dout("fault on LOSSYTX channel\n"); 2040 dout("fault on LOSSYTX channel\n");
@@ -1936,7 +2094,9 @@ out:
1936/* 2094/*
1937 * create a new messenger instance 2095 * create a new messenger instance
1938 */ 2096 */
1939struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr) 2097struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr,
2098 u32 supported_features,
2099 u32 required_features)
1940{ 2100{
1941 struct ceph_messenger *msgr; 2101 struct ceph_messenger *msgr;
1942 2102
@@ -1944,6 +2104,9 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1944 if (msgr == NULL) 2104 if (msgr == NULL)
1945 return ERR_PTR(-ENOMEM); 2105 return ERR_PTR(-ENOMEM);
1946 2106
2107 msgr->supported_features = supported_features;
2108 msgr->required_features = required_features;
2109
1947 spin_lock_init(&msgr->global_seq_lock); 2110 spin_lock_init(&msgr->global_seq_lock);
1948 2111
1949 /* the zero page is needed if a request is "canceled" while the message 2112 /* the zero page is needed if a request is "canceled" while the message
@@ -1966,6 +2129,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1966 dout("messenger_create %p\n", msgr); 2129 dout("messenger_create %p\n", msgr);
1967 return msgr; 2130 return msgr;
1968} 2131}
2132EXPORT_SYMBOL(ceph_messenger_create);
1969 2133
1970void ceph_messenger_destroy(struct ceph_messenger *msgr) 2134void ceph_messenger_destroy(struct ceph_messenger *msgr)
1971{ 2135{
@@ -1975,6 +2139,7 @@ void ceph_messenger_destroy(struct ceph_messenger *msgr)
1975 kfree(msgr); 2139 kfree(msgr);
1976 dout("destroyed messenger %p\n", msgr); 2140 dout("destroyed messenger %p\n", msgr);
1977} 2141}
2142EXPORT_SYMBOL(ceph_messenger_destroy);
1978 2143
1979/* 2144/*
1980 * Queue up an outgoing message on the given connection. 2145 * Queue up an outgoing message on the given connection.
@@ -2011,6 +2176,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
2011 if (test_and_set_bit(WRITE_PENDING, &con->state) == 0) 2176 if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2012 queue_con(con); 2177 queue_con(con);
2013} 2178}
2179EXPORT_SYMBOL(ceph_con_send);
2014 2180
2015/* 2181/*
2016 * Revoke a message that was previously queued for send 2182 * Revoke a message that was previously queued for send
@@ -2076,6 +2242,7 @@ void ceph_con_keepalive(struct ceph_connection *con)
2076 test_and_set_bit(WRITE_PENDING, &con->state) == 0) 2242 test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2077 queue_con(con); 2243 queue_con(con);
2078} 2244}
2245EXPORT_SYMBOL(ceph_con_keepalive);
2079 2246
2080 2247
2081/* 2248/*
@@ -2136,6 +2303,10 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
2136 m->nr_pages = 0; 2303 m->nr_pages = 0;
2137 m->pages = NULL; 2304 m->pages = NULL;
2138 m->pagelist = NULL; 2305 m->pagelist = NULL;
2306 m->bio = NULL;
2307 m->bio_iter = NULL;
2308 m->bio_seg = 0;
2309 m->trail = NULL;
2139 2310
2140 dout("ceph_msg_new %p front %d\n", m, front_len); 2311 dout("ceph_msg_new %p front %d\n", m, front_len);
2141 return m; 2312 return m;
@@ -2146,6 +2317,7 @@ out:
2146 pr_err("msg_new can't create type %d front %d\n", type, front_len); 2317 pr_err("msg_new can't create type %d front %d\n", type, front_len);
2147 return NULL; 2318 return NULL;
2148} 2319}
2320EXPORT_SYMBOL(ceph_msg_new);
2149 2321
2150/* 2322/*
2151 * Allocate "middle" portion of a message, if it is needed and wasn't 2323 * Allocate "middle" portion of a message, if it is needed and wasn't
@@ -2250,11 +2422,14 @@ void ceph_msg_last_put(struct kref *kref)
2250 m->pagelist = NULL; 2422 m->pagelist = NULL;
2251 } 2423 }
2252 2424
2425 m->trail = NULL;
2426
2253 if (m->pool) 2427 if (m->pool)
2254 ceph_msgpool_put(m->pool, m); 2428 ceph_msgpool_put(m->pool, m);
2255 else 2429 else
2256 ceph_msg_kfree(m); 2430 ceph_msg_kfree(m);
2257} 2431}
2432EXPORT_SYMBOL(ceph_msg_last_put);
2258 2433
2259void ceph_msg_dump(struct ceph_msg *msg) 2434void ceph_msg_dump(struct ceph_msg *msg)
2260{ 2435{
@@ -2275,3 +2450,4 @@ void ceph_msg_dump(struct ceph_msg *msg)
2275 DUMP_PREFIX_OFFSET, 16, 1, 2450 DUMP_PREFIX_OFFSET, 16, 1,
2276 &msg->footer, sizeof(msg->footer), true); 2451 &msg->footer, sizeof(msg->footer), true);
2277} 2452}
2453EXPORT_SYMBOL(ceph_msg_dump);
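With CEPH_FEATURE_SUPPORTED/REQUIRED no longer baked into the messenger, every user passes its own masks at create time; __ceph_open_session() above passes the client's, and another user would do the same with whatever masks it supports (sketch using the default masks named earlier in this patch):

	struct ceph_messenger *msgr;

	msgr = ceph_messenger_create(NULL,	/* no fixed local address */
				     CEPH_FEATURE_SUPPORTED_DEFAULT,
				     CEPH_FEATURE_REQUIRED_DEFAULT);
	if (IS_ERR(msgr))
		return PTR_ERR(msgr);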
diff --git a/fs/ceph/mon_client.c b/net/ceph/mon_client.c
index b2a5a3e4a671..8a079399174a 100644
--- a/fs/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -1,14 +1,16 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/module.h>
3#include <linux/types.h> 4#include <linux/types.h>
4#include <linux/slab.h> 5#include <linux/slab.h>
5#include <linux/random.h> 6#include <linux/random.h>
6#include <linux/sched.h> 7#include <linux/sched.h>
7 8
8#include "mon_client.h" 9#include <linux/ceph/mon_client.h>
9#include "super.h" 10#include <linux/ceph/libceph.h>
10#include "auth.h" 11#include <linux/ceph/decode.h>
11#include "decode.h" 12
13#include <linux/ceph/auth.h>
12 14
13/* 15/*
14 * Interact with Ceph monitor cluster. Handle requests for new map 16 * Interact with Ceph monitor cluster. Handle requests for new map
@@ -74,7 +76,7 @@ struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
74 m->num_mon); 76 m->num_mon);
75 for (i = 0; i < m->num_mon; i++) 77 for (i = 0; i < m->num_mon; i++)
76 dout("monmap_decode mon%d is %s\n", i, 78 dout("monmap_decode mon%d is %s\n", i,
77 pr_addr(&m->mon_inst[i].addr.in_addr)); 79 ceph_pr_addr(&m->mon_inst[i].addr.in_addr));
78 return m; 80 return m;
79 81
80bad: 82bad:
@@ -191,30 +193,33 @@ static void __send_subscribe(struct ceph_mon_client *monc)
191 struct ceph_msg *msg = monc->m_subscribe; 193 struct ceph_msg *msg = monc->m_subscribe;
192 struct ceph_mon_subscribe_item *i; 194 struct ceph_mon_subscribe_item *i;
193 void *p, *end; 195 void *p, *end;
196 int num;
194 197
195 p = msg->front.iov_base; 198 p = msg->front.iov_base;
196 end = p + msg->front_max; 199 end = p + msg->front_max;
197 200
198 dout("__send_subscribe to 'mdsmap' %u+\n", 201 num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
199 (unsigned)monc->have_mdsmap); 202 ceph_encode_32(&p, num);
203
200 if (monc->want_next_osdmap) { 204 if (monc->want_next_osdmap) {
201 dout("__send_subscribe to 'osdmap' %u\n", 205 dout("__send_subscribe to 'osdmap' %u\n",
202 (unsigned)monc->have_osdmap); 206 (unsigned)monc->have_osdmap);
203 ceph_encode_32(&p, 3);
204 ceph_encode_string(&p, end, "osdmap", 6); 207 ceph_encode_string(&p, end, "osdmap", 6);
205 i = p; 208 i = p;
206 i->have = cpu_to_le64(monc->have_osdmap); 209 i->have = cpu_to_le64(monc->have_osdmap);
207 i->onetime = 1; 210 i->onetime = 1;
208 p += sizeof(*i); 211 p += sizeof(*i);
209 monc->want_next_osdmap = 2; /* requested */ 212 monc->want_next_osdmap = 2; /* requested */
210 } else {
211 ceph_encode_32(&p, 2);
212 } 213 }
213 ceph_encode_string(&p, end, "mdsmap", 6); 214 if (monc->want_mdsmap) {
214 i = p; 215 dout("__send_subscribe to 'mdsmap' %u+\n",
215 i->have = cpu_to_le64(monc->have_mdsmap); 216 (unsigned)monc->have_mdsmap);
216 i->onetime = 0; 217 ceph_encode_string(&p, end, "mdsmap", 6);
217 p += sizeof(*i); 218 i = p;
219 i->have = cpu_to_le64(monc->have_mdsmap);
220 i->onetime = 0;
221 p += sizeof(*i);
222 }
218 ceph_encode_string(&p, end, "monmap", 6); 223 ceph_encode_string(&p, end, "monmap", 6);
219 i = p; 224 i = p;
220 i->have = 0; 225 i->have = 0;
@@ -243,7 +248,8 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc,
243 mutex_lock(&monc->mutex); 248 mutex_lock(&monc->mutex);
244 if (monc->hunting) { 249 if (monc->hunting) {
245 pr_info("mon%d %s session established\n", 250 pr_info("mon%d %s session established\n",
246 monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr)); 251 monc->cur_mon,
252 ceph_pr_addr(&monc->con->peer_addr.in_addr));
247 monc->hunting = false; 253 monc->hunting = false;
248 } 254 }
249 dout("handle_subscribe_ack after %d seconds\n", seconds); 255 dout("handle_subscribe_ack after %d seconds\n", seconds);
@@ -266,6 +272,7 @@ int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
266 mutex_unlock(&monc->mutex); 272 mutex_unlock(&monc->mutex);
267 return 0; 273 return 0;
268} 274}
275EXPORT_SYMBOL(ceph_monc_got_mdsmap);
269 276
270int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got) 277int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
271{ 278{
@@ -310,6 +317,7 @@ int ceph_monc_open_session(struct ceph_mon_client *monc)
310 mutex_unlock(&monc->mutex); 317 mutex_unlock(&monc->mutex);
311 return 0; 318 return 0;
312} 319}
320EXPORT_SYMBOL(ceph_monc_open_session);
313 321
314/* 322/*
315 * The monitor responds with mount ack indicate mount success. The 323 * The monitor responds with mount ack indicate mount success. The
@@ -540,6 +548,7 @@ out:
540 kref_put(&req->kref, release_generic_request); 548 kref_put(&req->kref, release_generic_request);
541 return err; 549 return err;
542} 550}
551EXPORT_SYMBOL(ceph_monc_do_statfs);
543 552
544/* 553/*
545 * pool ops 554 * pool ops
@@ -651,6 +660,7 @@ int ceph_monc_create_snapid(struct ceph_mon_client *monc,
651 pool, 0, (char *)snapid, sizeof(*snapid)); 660 pool, 0, (char *)snapid, sizeof(*snapid));
652 661
653} 662}
663EXPORT_SYMBOL(ceph_monc_create_snapid);
654 664
655int ceph_monc_delete_snapid(struct ceph_mon_client *monc, 665int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
656 u32 pool, u64 snapid) 666 u32 pool, u64 snapid)
@@ -708,9 +718,9 @@ static void delayed_work(struct work_struct *work)
708 */ 718 */
709static int build_initial_monmap(struct ceph_mon_client *monc) 719static int build_initial_monmap(struct ceph_mon_client *monc)
710{ 720{
711 struct ceph_mount_args *args = monc->client->mount_args; 721 struct ceph_options *opt = monc->client->options;
712 struct ceph_entity_addr *mon_addr = args->mon_addr; 722 struct ceph_entity_addr *mon_addr = opt->mon_addr;
713 int num_mon = args->num_mon; 723 int num_mon = opt->num_mon;
714 int i; 724 int i;
715 725
716 /* build initial monmap */ 726 /* build initial monmap */
@@ -728,11 +738,6 @@ static int build_initial_monmap(struct ceph_mon_client *monc)
728 } 738 }
729 monc->monmap->num_mon = num_mon; 739 monc->monmap->num_mon = num_mon;
730 monc->have_fsid = false; 740 monc->have_fsid = false;
731
732 /* release addr memory */
733 kfree(args->mon_addr);
734 args->mon_addr = NULL;
735 args->num_mon = 0;
736 return 0; 741 return 0;
737} 742}
738 743
@@ -753,8 +758,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
753 monc->con = NULL; 758 monc->con = NULL;
754 759
755 /* authentication */ 760 /* authentication */
756 monc->auth = ceph_auth_init(cl->mount_args->name, 761 monc->auth = ceph_auth_init(cl->options->name,
757 cl->mount_args->secret); 762 cl->options->secret);
758 if (IS_ERR(monc->auth)) 763 if (IS_ERR(monc->auth))
759 return PTR_ERR(monc->auth); 764 return PTR_ERR(monc->auth);
760 monc->auth->want_keys = 765 monc->auth->want_keys =
@@ -808,6 +813,7 @@ out_monmap:
808out: 813out:
809 return err; 814 return err;
810} 815}
816EXPORT_SYMBOL(ceph_monc_init);
811 817
812void ceph_monc_stop(struct ceph_mon_client *monc) 818void ceph_monc_stop(struct ceph_mon_client *monc)
813{ 819{
@@ -832,6 +838,7 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
832 838
833 kfree(monc->monmap); 839 kfree(monc->monmap);
834} 840}
841EXPORT_SYMBOL(ceph_monc_stop);
835 842
836static void handle_auth_reply(struct ceph_mon_client *monc, 843static void handle_auth_reply(struct ceph_mon_client *monc,
837 struct ceph_msg *msg) 844 struct ceph_msg *msg)
@@ -889,6 +896,7 @@ int ceph_monc_validate_auth(struct ceph_mon_client *monc)
889 mutex_unlock(&monc->mutex); 896 mutex_unlock(&monc->mutex);
890 return ret; 897 return ret;
891} 898}
899EXPORT_SYMBOL(ceph_monc_validate_auth);
892 900
893/* 901/*
894 * handle incoming message 902 * handle incoming message
@@ -922,15 +930,16 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
922 ceph_monc_handle_map(monc, msg); 930 ceph_monc_handle_map(monc, msg);
923 break; 931 break;
924 932
925 case CEPH_MSG_MDS_MAP:
926 ceph_mdsc_handle_map(&monc->client->mdsc, msg);
927 break;
928
929 case CEPH_MSG_OSD_MAP: 933 case CEPH_MSG_OSD_MAP:
930 ceph_osdc_handle_map(&monc->client->osdc, msg); 934 ceph_osdc_handle_map(&monc->client->osdc, msg);
931 break; 935 break;
932 936
933 default: 937 default:
938 /* can the chained handler handle it? */
939 if (monc->client->extra_mon_dispatch &&
940 monc->client->extra_mon_dispatch(monc->client, msg) == 0)
941 break;
942
934 pr_err("received unknown message type %d %s\n", type, 943 pr_err("received unknown message type %d %s\n", type,
935 ceph_msg_type_name(type)); 944 ceph_msg_type_name(type));
936 } 945 }
@@ -994,7 +1003,7 @@ static void mon_fault(struct ceph_connection *con)
994 if (monc->con && !monc->hunting) 1003 if (monc->con && !monc->hunting)
995 pr_info("mon%d %s session lost, " 1004 pr_info("mon%d %s session lost, "
996 "hunting for new mon\n", monc->cur_mon, 1005 "hunting for new mon\n", monc->cur_mon,
997 pr_addr(&monc->con->peer_addr.in_addr)); 1006 ceph_pr_addr(&monc->con->peer_addr.in_addr));
998 1007
999 __close_session(monc); 1008 __close_session(monc);
1000 if (!monc->hunting) { 1009 if (!monc->hunting) {
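The dispatch() hunk above drops the hard-coded CEPH_MSG_MDS_MAP case and instead offers any monitor message libceph does not recognize to client->extra_mon_dispatch before logging an error. A minimal sketch of how a libceph user might chain in; handle_foo_map() and my_client_setup() are hypothetical names, only extra_mon_dispatch comes from the hunk above:

#include <linux/ceph/libceph.h>

/* hypothetical consumer of a map type libceph itself no longer interprets */
static void handle_foo_map(struct ceph_client *client, struct ceph_msg *msg)
{
}

static int my_extra_mon_dispatch(struct ceph_client *client,
				 struct ceph_msg *msg)
{
	switch (le16_to_cpu(msg->hdr.type)) {
	case CEPH_MSG_MDS_MAP:
		handle_foo_map(client, msg);
		return 0;	/* 0 means handled; dispatch() stops here */
	default:
		return -1;	/* anything else falls through to the pr_err() */
	}
}

static void my_client_setup(struct ceph_client *client)
{
	/* install before ceph_monc_open_session() so no maps are missed */
	client->extra_mon_dispatch = my_extra_mon_dispatch;
}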
diff --git a/fs/ceph/msgpool.c b/net/ceph/msgpool.c
index dd65a6438131..d5f2d97ac05c 100644
--- a/fs/ceph/msgpool.c
+++ b/net/ceph/msgpool.c
@@ -1,11 +1,11 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/err.h> 3#include <linux/err.h>
4#include <linux/sched.h> 4#include <linux/sched.h>
5#include <linux/types.h> 5#include <linux/types.h>
6#include <linux/vmalloc.h> 6#include <linux/vmalloc.h>
7 7
8#include "msgpool.h" 8#include <linux/ceph/msgpool.h>
9 9
10static void *alloc_fn(gfp_t gfp_mask, void *arg) 10static void *alloc_fn(gfp_t gfp_mask, void *arg)
11{ 11{
diff --git a/fs/ceph/osd_client.c b/net/ceph/osd_client.c
index 3b5571b8ce22..79391994b3ed 100644
--- a/fs/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1,17 +1,22 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/module.h>
3#include <linux/err.h> 4#include <linux/err.h>
4#include <linux/highmem.h> 5#include <linux/highmem.h>
5#include <linux/mm.h> 6#include <linux/mm.h>
6#include <linux/pagemap.h> 7#include <linux/pagemap.h>
7#include <linux/slab.h> 8#include <linux/slab.h>
8#include <linux/uaccess.h> 9#include <linux/uaccess.h>
10#ifdef CONFIG_BLOCK
11#include <linux/bio.h>
12#endif
9 13
10#include "super.h" 14#include <linux/ceph/libceph.h>
11#include "osd_client.h" 15#include <linux/ceph/osd_client.h>
12#include "messenger.h" 16#include <linux/ceph/messenger.h>
13#include "decode.h" 17#include <linux/ceph/decode.h>
14#include "auth.h" 18#include <linux/ceph/auth.h>
19#include <linux/ceph/pagelist.h>
15 20
16#define OSD_OP_FRONT_LEN 4096 21#define OSD_OP_FRONT_LEN 4096
17#define OSD_OPREPLY_FRONT_LEN 512 22#define OSD_OPREPLY_FRONT_LEN 512
@@ -22,6 +27,59 @@ static int __kick_requests(struct ceph_osd_client *osdc,
22 27
23static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd); 28static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
24 29
30static int op_needs_trail(int op)
31{
32 switch (op) {
33 case CEPH_OSD_OP_GETXATTR:
34 case CEPH_OSD_OP_SETXATTR:
35 case CEPH_OSD_OP_CMPXATTR:
36 case CEPH_OSD_OP_CALL:
37 return 1;
38 default:
39 return 0;
40 }
41}
42
43static int op_has_extent(int op)
44{
45 return (op == CEPH_OSD_OP_READ ||
46 op == CEPH_OSD_OP_WRITE);
47}
48
49void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
50 struct ceph_file_layout *layout,
51 u64 snapid,
52 u64 off, u64 *plen, u64 *bno,
53 struct ceph_osd_request *req,
54 struct ceph_osd_req_op *op)
55{
56 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
57 u64 orig_len = *plen;
58 u64 objoff, objlen; /* extent in object */
59
60 reqhead->snapid = cpu_to_le64(snapid);
61
62 /* object extent? */
63 ceph_calc_file_object_mapping(layout, off, plen, bno,
64 &objoff, &objlen);
65 if (*plen < orig_len)
66 dout(" skipping last %llu, final file extent %llu~%llu\n",
67 orig_len - *plen, off, *plen);
68
69 if (op_has_extent(op->op)) {
70 op->extent.offset = objoff;
71 op->extent.length = objlen;
72 }
73 req->r_num_pages = calc_pages_for(off, *plen);
74 if (op->op == CEPH_OSD_OP_WRITE)
75 op->payload_len = *plen;
76
77 dout("calc_layout bno=%llx %llu~%llu (%d pages)\n",
78 *bno, objoff, objlen, req->r_num_pages);
79
80}
81EXPORT_SYMBOL(ceph_calc_raw_layout);
82
25/* 83/*
26 * Implement client access to distributed object storage cluster. 84 * Implement client access to distributed object storage cluster.
27 * 85 *
@@ -48,34 +106,19 @@ static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
48 * fill osd op in request message. 106 * fill osd op in request message.
49 */ 107 */
50static void calc_layout(struct ceph_osd_client *osdc, 108static void calc_layout(struct ceph_osd_client *osdc,
51 struct ceph_vino vino, struct ceph_file_layout *layout, 109 struct ceph_vino vino,
110 struct ceph_file_layout *layout,
52 u64 off, u64 *plen, 111 u64 off, u64 *plen,
53 struct ceph_osd_request *req) 112 struct ceph_osd_request *req,
113 struct ceph_osd_req_op *op)
54{ 114{
55 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
56 struct ceph_osd_op *op = (void *)(reqhead + 1);
57 u64 orig_len = *plen;
58 u64 objoff, objlen; /* extent in object */
59 u64 bno; 115 u64 bno;
60 116
61 reqhead->snapid = cpu_to_le64(vino.snap); 117 ceph_calc_raw_layout(osdc, layout, vino.snap, off,
62 118 plen, &bno, req, op);
63 /* object extent? */
64 ceph_calc_file_object_mapping(layout, off, plen, &bno,
65 &objoff, &objlen);
66 if (*plen < orig_len)
67 dout(" skipping last %llu, final file extent %llu~%llu\n",
68 orig_len - *plen, off, *plen);
69 119
70 sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno); 120 sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
71 req->r_oid_len = strlen(req->r_oid); 121 req->r_oid_len = strlen(req->r_oid);
72
73 op->extent.offset = cpu_to_le64(objoff);
74 op->extent.length = cpu_to_le64(objlen);
75 req->r_num_pages = calc_pages_for(off, *plen);
76
77 dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
78 req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
79} 122}
80 123
81/* 124/*
@@ -101,56 +144,66 @@ void ceph_osdc_release_request(struct kref *kref)
101 if (req->r_own_pages) 144 if (req->r_own_pages)
102 ceph_release_page_vector(req->r_pages, 145 ceph_release_page_vector(req->r_pages,
103 req->r_num_pages); 146 req->r_num_pages);
147#ifdef CONFIG_BLOCK
148 if (req->r_bio)
149 bio_put(req->r_bio);
150#endif
104 ceph_put_snap_context(req->r_snapc); 151 ceph_put_snap_context(req->r_snapc);
152 if (req->r_trail) {
153 ceph_pagelist_release(req->r_trail);
154 kfree(req->r_trail);
155 }
105 if (req->r_mempool) 156 if (req->r_mempool)
106 mempool_free(req, req->r_osdc->req_mempool); 157 mempool_free(req, req->r_osdc->req_mempool);
107 else 158 else
108 kfree(req); 159 kfree(req);
109} 160}
161EXPORT_SYMBOL(ceph_osdc_release_request);
110 162
111/* 163static int get_num_ops(struct ceph_osd_req_op *ops, int *needs_trail)
112 * build new request AND message, calculate layout, and adjust file 164{
113 * extent as needed. 165 int i = 0;
114 * 166
115 * if the file was recently truncated, we include information about its 167 if (needs_trail)
116 * old and new size so that the object can be updated appropriately. (we 168 *needs_trail = 0;
117 * avoid synchronously deleting truncated objects because it's slow.) 169 while (ops[i].op) {
118 * 170 if (needs_trail && op_needs_trail(ops[i].op))
119 * if @do_sync, include a 'startsync' command so that the osd will flush 171 *needs_trail = 1;
120 * data quickly. 172 i++;
121 */ 173 }
122struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, 174
123 struct ceph_file_layout *layout, 175 return i;
124 struct ceph_vino vino, 176}
125 u64 off, u64 *plen, 177
126 int opcode, int flags, 178struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
179 int flags,
127 struct ceph_snap_context *snapc, 180 struct ceph_snap_context *snapc,
128 int do_sync, 181 struct ceph_osd_req_op *ops,
129 u32 truncate_seq, 182 bool use_mempool,
130 u64 truncate_size, 183 gfp_t gfp_flags,
131 struct timespec *mtime, 184 struct page **pages,
132 bool use_mempool, int num_reply) 185 struct bio *bio)
133{ 186{
134 struct ceph_osd_request *req; 187 struct ceph_osd_request *req;
135 struct ceph_msg *msg; 188 struct ceph_msg *msg;
136 struct ceph_osd_request_head *head; 189 int needs_trail;
137 struct ceph_osd_op *op; 190 int num_op = get_num_ops(ops, &needs_trail);
138 void *p; 191 size_t msg_size = sizeof(struct ceph_osd_request_head);
139 int num_op = 1 + do_sync; 192
140 size_t msg_size = sizeof(*head) + num_op*sizeof(*op); 193 msg_size += num_op*sizeof(struct ceph_osd_op);
141 int i;
142 194
143 if (use_mempool) { 195 if (use_mempool) {
144 req = mempool_alloc(osdc->req_mempool, GFP_NOFS); 196 req = mempool_alloc(osdc->req_mempool, gfp_flags);
145 memset(req, 0, sizeof(*req)); 197 memset(req, 0, sizeof(*req));
146 } else { 198 } else {
147 req = kzalloc(sizeof(*req), GFP_NOFS); 199 req = kzalloc(sizeof(*req), gfp_flags);
148 } 200 }
149 if (req == NULL) 201 if (req == NULL)
150 return NULL; 202 return NULL;
151 203
152 req->r_osdc = osdc; 204 req->r_osdc = osdc;
153 req->r_mempool = use_mempool; 205 req->r_mempool = use_mempool;
206
154 kref_init(&req->r_kref); 207 kref_init(&req->r_kref);
155 init_completion(&req->r_completion); 208 init_completion(&req->r_completion);
156 init_completion(&req->r_safe_completion); 209 init_completion(&req->r_safe_completion);
@@ -164,13 +217,22 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); 217 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
165 else 218 else
166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, 219 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
167 OSD_OPREPLY_FRONT_LEN, GFP_NOFS); 220 OSD_OPREPLY_FRONT_LEN, gfp_flags);
168 if (!msg) { 221 if (!msg) {
169 ceph_osdc_put_request(req); 222 ceph_osdc_put_request(req);
170 return NULL; 223 return NULL;
171 } 224 }
172 req->r_reply = msg; 225 req->r_reply = msg;
173 226
227 /* allocate space for the trailing data */
228 if (needs_trail) {
229 req->r_trail = kmalloc(sizeof(struct ceph_pagelist), gfp_flags);
230 if (!req->r_trail) {
231 ceph_osdc_put_request(req);
232 return NULL;
233 }
234 ceph_pagelist_init(req->r_trail);
235 }
174 /* create request message; allow space for oid */ 236 /* create request message; allow space for oid */
175 msg_size += 40; 237 msg_size += 40;
176 if (snapc) 238 if (snapc)
@@ -178,18 +240,115 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
178 if (use_mempool) 240 if (use_mempool)
179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0); 241 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
180 else 242 else
181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS); 243 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, gfp_flags);
182 if (!msg) { 244 if (!msg) {
183 ceph_osdc_put_request(req); 245 ceph_osdc_put_request(req);
184 return NULL; 246 return NULL;
185 } 247 }
248
186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP); 249 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
187 memset(msg->front.iov_base, 0, msg->front.iov_len); 250 memset(msg->front.iov_base, 0, msg->front.iov_len);
251
252 req->r_request = msg;
253 req->r_pages = pages;
254#ifdef CONFIG_BLOCK
255 if (bio) {
256 req->r_bio = bio;
257 bio_get(req->r_bio);
258 }
259#endif
260
261 return req;
262}
263EXPORT_SYMBOL(ceph_osdc_alloc_request);
264
265static void osd_req_encode_op(struct ceph_osd_request *req,
266 struct ceph_osd_op *dst,
267 struct ceph_osd_req_op *src)
268{
269 dst->op = cpu_to_le16(src->op);
270
271 switch (dst->op) {
272 case CEPH_OSD_OP_READ:
273 case CEPH_OSD_OP_WRITE:
274 dst->extent.offset =
275 cpu_to_le64(src->extent.offset);
276 dst->extent.length =
277 cpu_to_le64(src->extent.length);
278 dst->extent.truncate_size =
279 cpu_to_le64(src->extent.truncate_size);
280 dst->extent.truncate_seq =
281 cpu_to_le32(src->extent.truncate_seq);
282 break;
283
284 case CEPH_OSD_OP_GETXATTR:
285 case CEPH_OSD_OP_SETXATTR:
286 case CEPH_OSD_OP_CMPXATTR:
287 BUG_ON(!req->r_trail);
288
289 dst->xattr.name_len = cpu_to_le32(src->xattr.name_len);
290 dst->xattr.value_len = cpu_to_le32(src->xattr.value_len);
291 dst->xattr.cmp_op = src->xattr.cmp_op;
292 dst->xattr.cmp_mode = src->xattr.cmp_mode;
293 ceph_pagelist_append(req->r_trail, src->xattr.name,
294 src->xattr.name_len);
295 ceph_pagelist_append(req->r_trail, src->xattr.val,
296 src->xattr.value_len);
297 break;
298 case CEPH_OSD_OP_CALL:
299 BUG_ON(!req->r_trail);
300
301 dst->cls.class_len = src->cls.class_len;
302 dst->cls.method_len = src->cls.method_len;
303 dst->cls.indata_len = cpu_to_le32(src->cls.indata_len);
304
305 ceph_pagelist_append(req->r_trail, src->cls.class_name,
306 src->cls.class_len);
307 ceph_pagelist_append(req->r_trail, src->cls.method_name,
308 src->cls.method_len);
309 ceph_pagelist_append(req->r_trail, src->cls.indata,
310 src->cls.indata_len);
311 break;
312 case CEPH_OSD_OP_ROLLBACK:
313 dst->snap.snapid = cpu_to_le64(src->snap.snapid);
314 break;
315 case CEPH_OSD_OP_STARTSYNC:
316 break;
317 default:
318 pr_err("unrecognized osd opcode %d\n", dst->op);
319 WARN_ON(1);
320 break;
321 }
322 dst->payload_len = cpu_to_le32(src->payload_len);
323}
324
325/*
326 * build new request AND message
327 *
328 */
329void ceph_osdc_build_request(struct ceph_osd_request *req,
330 u64 off, u64 *plen,
331 struct ceph_osd_req_op *src_ops,
332 struct ceph_snap_context *snapc,
333 struct timespec *mtime,
334 const char *oid,
335 int oid_len)
336{
337 struct ceph_msg *msg = req->r_request;
338 struct ceph_osd_request_head *head;
339 struct ceph_osd_req_op *src_op;
340 struct ceph_osd_op *op;
341 void *p;
342 int num_op = get_num_ops(src_ops, NULL);
343 size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
344 int flags = req->r_flags;
345 u64 data_len = 0;
346 int i;
347
188 head = msg->front.iov_base; 348 head = msg->front.iov_base;
189 op = (void *)(head + 1); 349 op = (void *)(head + 1);
190 p = (void *)(op + num_op); 350 p = (void *)(op + num_op);
191 351
192 req->r_request = msg;
193 req->r_snapc = ceph_get_snap_context(snapc); 352 req->r_snapc = ceph_get_snap_context(snapc);
194 353
195 head->client_inc = cpu_to_le32(1); /* always, for now. */ 354 head->client_inc = cpu_to_le32(1); /* always, for now. */
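osd_req_encode_op() above copies the extent fields inline but routes xattr and class-call payloads through req->r_trail, so a caller only describes the operation in the host-order ceph_osd_req_op. A hedged sketch of filling a CEPH_OSD_OP_CALL descriptor; the wrapper, class, method and payload handling are illustrative, with field names taken from the hunk above:

#include <linux/string.h>
#include <linux/ceph/osd_client.h>

static void sketch_fill_call_op(struct ceph_osd_req_op *op,
				const char *class_name, const char *method,
				const char *indata, size_t indata_len)
{
	op->op = CEPH_OSD_OP_CALL;
	op->cls.class_name = class_name;
	op->cls.class_len = strlen(class_name);
	op->cls.method_name = method;
	op->cls.method_len = strlen(method);
	op->cls.indata = indata;
	op->cls.indata_len = indata_len;
	/* assumption: payload_len mirrors the bytes that end up in the trail */
	op->payload_len = op->cls.class_len + op->cls.method_len + indata_len;
}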
@@ -197,29 +356,23 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
197 if (flags & CEPH_OSD_FLAG_WRITE) 356 if (flags & CEPH_OSD_FLAG_WRITE)
198 ceph_encode_timespec(&head->mtime, mtime); 357 ceph_encode_timespec(&head->mtime, mtime);
199 head->num_ops = cpu_to_le16(num_op); 358 head->num_ops = cpu_to_le16(num_op);
200 op->op = cpu_to_le16(opcode);
201 359
202 /* calculate max write size */
203 calc_layout(osdc, vino, layout, off, plen, req);
204 req->r_file_layout = *layout; /* keep a copy */
205
206 if (flags & CEPH_OSD_FLAG_WRITE) {
207 req->r_request->hdr.data_off = cpu_to_le16(off);
208 req->r_request->hdr.data_len = cpu_to_le32(*plen);
209 op->payload_len = cpu_to_le32(*plen);
210 }
211 op->extent.truncate_size = cpu_to_le64(truncate_size);
212 op->extent.truncate_seq = cpu_to_le32(truncate_seq);
213 360
214 /* fill in oid */ 361 /* fill in oid */
215 head->object_len = cpu_to_le32(req->r_oid_len); 362 head->object_len = cpu_to_le32(oid_len);
216 memcpy(p, req->r_oid, req->r_oid_len); 363 memcpy(p, oid, oid_len);
217 p += req->r_oid_len; 364 p += oid_len;
218 365
219 if (do_sync) { 366 src_op = src_ops;
367 while (src_op->op) {
368 osd_req_encode_op(req, op, src_op);
369 src_op++;
220 op++; 370 op++;
221 op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
222 } 371 }
372
373 if (req->r_trail)
374 data_len += req->r_trail->length;
375
223 if (snapc) { 376 if (snapc) {
224 head->snap_seq = cpu_to_le64(snapc->seq); 377 head->snap_seq = cpu_to_le64(snapc->seq);
225 head->num_snaps = cpu_to_le32(snapc->num_snaps); 378 head->num_snaps = cpu_to_le32(snapc->num_snaps);
@@ -229,12 +382,79 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
229 } 382 }
230 } 383 }
231 384
385 if (flags & CEPH_OSD_FLAG_WRITE) {
386 req->r_request->hdr.data_off = cpu_to_le16(off);
387 req->r_request->hdr.data_len = cpu_to_le32(*plen + data_len);
388 } else if (data_len) {
389 req->r_request->hdr.data_off = 0;
390 req->r_request->hdr.data_len = cpu_to_le32(data_len);
391 }
392
232 BUG_ON(p > msg->front.iov_base + msg->front.iov_len); 393 BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
233 msg_size = p - msg->front.iov_base; 394 msg_size = p - msg->front.iov_base;
234 msg->front.iov_len = msg_size; 395 msg->front.iov_len = msg_size;
235 msg->hdr.front_len = cpu_to_le32(msg_size); 396 msg->hdr.front_len = cpu_to_le32(msg_size);
397 return;
398}
399EXPORT_SYMBOL(ceph_osdc_build_request);
400
401/*
402 * build new request AND message, calculate layout, and adjust file
403 * extent as needed.
404 *
405 * if the file was recently truncated, we include information about its
406 * old and new size so that the object can be updated appropriately. (we
407 * avoid synchronously deleting truncated objects because it's slow.)
408 *
409 * if @do_sync, include a 'startsync' command so that the osd will flush
410 * data quickly.
411 */
412struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
413 struct ceph_file_layout *layout,
414 struct ceph_vino vino,
415 u64 off, u64 *plen,
416 int opcode, int flags,
417 struct ceph_snap_context *snapc,
418 int do_sync,
419 u32 truncate_seq,
420 u64 truncate_size,
421 struct timespec *mtime,
422 bool use_mempool, int num_reply)
423{
424 struct ceph_osd_req_op ops[3];
425 struct ceph_osd_request *req;
426
427 ops[0].op = opcode;
428 ops[0].extent.truncate_seq = truncate_seq;
429 ops[0].extent.truncate_size = truncate_size;
430 ops[0].payload_len = 0;
431
432 if (do_sync) {
433 ops[1].op = CEPH_OSD_OP_STARTSYNC;
434 ops[1].payload_len = 0;
435 ops[2].op = 0;
436 } else
437 ops[1].op = 0;
438
439 req = ceph_osdc_alloc_request(osdc, flags,
440 snapc, ops,
441 use_mempool,
442 GFP_NOFS, NULL, NULL);
443 if (IS_ERR(req))
444 return req;
445
446 /* calculate max write size */
447 calc_layout(osdc, vino, layout, off, plen, req, ops);
448 req->r_file_layout = *layout; /* keep a copy */
449
450 ceph_osdc_build_request(req, off, plen, ops,
451 snapc,
452 mtime,
453 req->r_oid, req->r_oid_len);
454
236 return req; 455 return req;
237} 456}
457EXPORT_SYMBOL(ceph_osdc_new_request);
238 458
239/* 459/*
240 * We keep osd requests in an rbtree, sorted by ->r_tid. 460 * We keep osd requests in an rbtree, sorted by ->r_tid.
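ceph_osdc_new_request() above also documents the new calling convention for other users of the exported API: describe the I/O in a zero-terminated ceph_osd_req_op array, allocate with ceph_osdc_alloc_request(), then encode with ceph_osdc_build_request(). A condensed sketch for a single-op read; the extent is filled in directly here, whereas the real callers derive it through calc_layout()/ceph_calc_raw_layout():

#include <linux/gfp.h>
#include <linux/ceph/osd_client.h>

static struct ceph_osd_request *
sketch_read_request(struct ceph_osd_client *osdc, struct page **pages,
		    const char *oid, int oid_len, u64 off, u64 *plen)
{
	struct ceph_osd_req_op ops[2];
	struct ceph_osd_request *req;

	ops[0].op = CEPH_OSD_OP_READ;
	ops[0].extent.offset = off;
	ops[0].extent.length = *plen;
	ops[0].extent.truncate_size = 0;
	ops[0].extent.truncate_seq = 0;
	ops[0].payload_len = 0;
	ops[1].op = 0;			/* get_num_ops() stops at a zero opcode */

	req = ceph_osdc_alloc_request(osdc, CEPH_OSD_FLAG_READ, NULL, ops,
				      false, GFP_NOFS, pages, NULL);
	if (!req)
		return NULL;

	ceph_osdc_build_request(req, off, plen, ops, NULL, NULL,
				oid, oid_len);
	return req;
}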
@@ -389,7 +609,7 @@ static void __move_osd_to_lru(struct ceph_osd_client *osdc,
389 dout("__move_osd_to_lru %p\n", osd); 609 dout("__move_osd_to_lru %p\n", osd);
390 BUG_ON(!list_empty(&osd->o_osd_lru)); 610 BUG_ON(!list_empty(&osd->o_osd_lru));
391 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru); 611 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
392 osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ; 612 osd->lru_ttl = jiffies + osdc->client->options->osd_idle_ttl * HZ;
393} 613}
394 614
395static void __remove_osd_from_lru(struct ceph_osd *osd) 615static void __remove_osd_from_lru(struct ceph_osd *osd)
@@ -483,7 +703,7 @@ static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
483static void __schedule_osd_timeout(struct ceph_osd_client *osdc) 703static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
484{ 704{
485 schedule_delayed_work(&osdc->timeout_work, 705 schedule_delayed_work(&osdc->timeout_work,
486 osdc->client->mount_args->osd_keepalive_timeout * HZ); 706 osdc->client->options->osd_keepalive_timeout * HZ);
487} 707}
488 708
489static void __cancel_osd_timeout(struct ceph_osd_client *osdc) 709static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
@@ -684,9 +904,9 @@ static void handle_timeout(struct work_struct *work)
684 container_of(work, struct ceph_osd_client, timeout_work.work); 904 container_of(work, struct ceph_osd_client, timeout_work.work);
685 struct ceph_osd_request *req, *last_req = NULL; 905 struct ceph_osd_request *req, *last_req = NULL;
686 struct ceph_osd *osd; 906 struct ceph_osd *osd;
687 unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ; 907 unsigned long timeout = osdc->client->options->osd_timeout * HZ;
688 unsigned long keepalive = 908 unsigned long keepalive =
689 osdc->client->mount_args->osd_keepalive_timeout * HZ; 909 osdc->client->options->osd_keepalive_timeout * HZ;
690 unsigned long last_stamp = 0; 910 unsigned long last_stamp = 0;
691 struct rb_node *p; 911 struct rb_node *p;
692 struct list_head slow_osds; 912 struct list_head slow_osds;
@@ -773,7 +993,7 @@ static void handle_osds_timeout(struct work_struct *work)
773 container_of(work, struct ceph_osd_client, 993 container_of(work, struct ceph_osd_client,
774 osds_timeout_work.work); 994 osds_timeout_work.work);
775 unsigned long delay = 995 unsigned long delay =
776 osdc->client->mount_args->osd_idle_ttl * HZ >> 2; 996 osdc->client->options->osd_idle_ttl * HZ >> 2;
777 997
778 dout("osds timeout\n"); 998 dout("osds timeout\n");
779 down_read(&osdc->map_sem); 999 down_read(&osdc->map_sem);
@@ -1104,6 +1324,10 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1104 1324
1105 req->r_request->pages = req->r_pages; 1325 req->r_request->pages = req->r_pages;
1106 req->r_request->nr_pages = req->r_num_pages; 1326 req->r_request->nr_pages = req->r_num_pages;
1327#ifdef CONFIG_BLOCK
1328 req->r_request->bio = req->r_bio;
1329#endif
1330 req->r_request->trail = req->r_trail;
1107 1331
1108 register_request(osdc, req); 1332 register_request(osdc, req);
1109 1333
@@ -1131,6 +1355,7 @@ int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1131 up_read(&osdc->map_sem); 1355 up_read(&osdc->map_sem);
1132 return rc; 1356 return rc;
1133} 1357}
1358EXPORT_SYMBOL(ceph_osdc_start_request);
1134 1359
1135/* 1360/*
1136 * wait for a request to complete 1361 * wait for a request to complete
@@ -1153,6 +1378,7 @@ int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
1153 dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result); 1378 dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
1154 return req->r_result; 1379 return req->r_result;
1155} 1380}
1381EXPORT_SYMBOL(ceph_osdc_wait_request);
1156 1382
1157/* 1383/*
1158 * sync - wait for all in-flight requests to flush. avoid starvation. 1384 * sync - wait for all in-flight requests to flush. avoid starvation.
@@ -1186,6 +1412,7 @@ void ceph_osdc_sync(struct ceph_osd_client *osdc)
1186 mutex_unlock(&osdc->request_mutex); 1412 mutex_unlock(&osdc->request_mutex);
1187 dout("sync done (thru tid %llu)\n", last_tid); 1413 dout("sync done (thru tid %llu)\n", last_tid);
1188} 1414}
1415EXPORT_SYMBOL(ceph_osdc_sync);
1189 1416
1190/* 1417/*
1191 * init, shutdown 1418 * init, shutdown
@@ -1211,7 +1438,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1211 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); 1438 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
1212 1439
1213 schedule_delayed_work(&osdc->osds_timeout_work, 1440 schedule_delayed_work(&osdc->osds_timeout_work,
1214 round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ)); 1441 round_jiffies_relative(osdc->client->options->osd_idle_ttl * HZ));
1215 1442
1216 err = -ENOMEM; 1443 err = -ENOMEM;
1217 osdc->req_mempool = mempool_create_kmalloc_pool(10, 1444 osdc->req_mempool = mempool_create_kmalloc_pool(10,
@@ -1237,6 +1464,7 @@ out_mempool:
1237out: 1464out:
1238 return err; 1465 return err;
1239} 1466}
1467EXPORT_SYMBOL(ceph_osdc_init);
1240 1468
1241void ceph_osdc_stop(struct ceph_osd_client *osdc) 1469void ceph_osdc_stop(struct ceph_osd_client *osdc)
1242{ 1470{
@@ -1251,6 +1479,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc)
1251 ceph_msgpool_destroy(&osdc->msgpool_op); 1479 ceph_msgpool_destroy(&osdc->msgpool_op);
1252 ceph_msgpool_destroy(&osdc->msgpool_op_reply); 1480 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
1253} 1481}
1482EXPORT_SYMBOL(ceph_osdc_stop);
1254 1483
1255/* 1484/*
1256 * Read some contiguous pages. If we cross a stripe boundary, shorten 1485 * Read some contiguous pages. If we cross a stripe boundary, shorten
@@ -1288,6 +1517,7 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1288 dout("readpages result %d\n", rc); 1517 dout("readpages result %d\n", rc);
1289 return rc; 1518 return rc;
1290} 1519}
1520EXPORT_SYMBOL(ceph_osdc_readpages);
1291 1521
1292/* 1522/*
1293 * do a synchronous write on N pages 1523 * do a synchronous write on N pages
@@ -1330,6 +1560,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1330 dout("writepages result %d\n", rc); 1560 dout("writepages result %d\n", rc);
1331 return rc; 1561 return rc;
1332} 1562}
1563EXPORT_SYMBOL(ceph_osdc_writepages);
1333 1564
1334/* 1565/*
1335 * handle incoming message 1566 * handle incoming message
@@ -1420,6 +1651,9 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1420 } 1651 }
1421 m->pages = req->r_pages; 1652 m->pages = req->r_pages;
1422 m->nr_pages = req->r_num_pages; 1653 m->nr_pages = req->r_num_pages;
1654#ifdef CONFIG_BLOCK
1655 m->bio = req->r_bio;
1656#endif
1423 } 1657 }
1424 *skip = 0; 1658 *skip = 0;
1425 req->r_con_filling_msg = ceph_con_get(con); 1659 req->r_con_filling_msg = ceph_con_get(con);
diff --git a/fs/ceph/osdmap.c b/net/ceph/osdmap.c
index e31f118f1392..d73f3f6efa36 100644
--- a/fs/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1,14 +1,15 @@
1 1
2#include "ceph_debug.h" 2#include <linux/ceph/ceph_debug.h>
3 3
4#include <linux/module.h>
4#include <linux/slab.h> 5#include <linux/slab.h>
5#include <asm/div64.h> 6#include <asm/div64.h>
6 7
7#include "super.h" 8#include <linux/ceph/libceph.h>
8#include "osdmap.h" 9#include <linux/ceph/osdmap.h>
9#include "crush/hash.h" 10#include <linux/ceph/decode.h>
10#include "crush/mapper.h" 11#include <linux/crush/hash.h>
11#include "decode.h" 12#include <linux/crush/mapper.h>
12 13
13char *ceph_osdmap_state_str(char *str, int len, int state) 14char *ceph_osdmap_state_str(char *str, int len, int state)
14{ 15{
@@ -417,6 +418,20 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
417 return NULL; 418 return NULL;
418} 419}
419 420
421int ceph_pg_poolid_by_name(struct ceph_osdmap *map, const char *name)
422{
423 struct rb_node *rbp;
424
425 for (rbp = rb_first(&map->pg_pools); rbp; rbp = rb_next(rbp)) {
426 struct ceph_pg_pool_info *pi =
427 rb_entry(rbp, struct ceph_pg_pool_info, node);
428 if (pi->name && strcmp(pi->name, name) == 0)
429 return pi->id;
430 }
431 return -ENOENT;
432}
433EXPORT_SYMBOL(ceph_pg_poolid_by_name);
434
420static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) 435static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi)
421{ 436{
422 rb_erase(&pi->node, root); 437 rb_erase(&pi->node, root);
@@ -966,6 +981,7 @@ void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
966 981
967 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen); 982 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
968} 983}
984EXPORT_SYMBOL(ceph_calc_file_object_mapping);
969 985
970/* 986/*
971 * calculate an object layout (i.e. pgid) from an oid, 987 * calculate an object layout (i.e. pgid) from an oid,
@@ -1011,6 +1027,7 @@ int ceph_calc_object_layout(struct ceph_object_layout *ol,
1011 ol->ol_stripe_unit = fl->fl_object_stripe_unit; 1027 ol->ol_stripe_unit = fl->fl_object_stripe_unit;
1012 return 0; 1028 return 0;
1013} 1029}
1030EXPORT_SYMBOL(ceph_calc_object_layout);
1014 1031
1015/* 1032/*
1016 * Calculate raw osd vector for the given pgid. Return pointer to osd 1033 * Calculate raw osd vector for the given pgid. Return pointer to osd
@@ -1108,3 +1125,4 @@ int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1108 return osds[i]; 1125 return osds[i];
1109 return -1; 1126 return -1;
1110} 1127}
1128EXPORT_SYMBOL(ceph_calc_pg_primary);
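The new ceph_pg_poolid_by_name() export gives callers outside fs/ceph a way to resolve a pool before building requests; it returns -ENOENT when no pool of that name exists. A trivial sketch (the wrapper and its use of the pool id are illustrative):

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/ceph/osdmap.h>

static int sketch_lookup_pool(struct ceph_osdmap *map, const char *name,
			      int *poolid)
{
	int id = ceph_pg_poolid_by_name(map, name);

	if (id == -ENOENT) {
		pr_err("pool '%s' does not exist\n", name);
		return -ENOENT;
	}
	*poolid = id;
	return 0;
}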
diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c
new file mode 100644
index 000000000000..13cb409a7bba
--- /dev/null
+++ b/net/ceph/pagelist.c
@@ -0,0 +1,154 @@
1
2#include <linux/module.h>
3#include <linux/gfp.h>
4#include <linux/pagemap.h>
5#include <linux/highmem.h>
6#include <linux/ceph/pagelist.h>
7
8static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
9{
10 if (pl->mapped_tail) {
11 struct page *page = list_entry(pl->head.prev, struct page, lru);
12 kunmap(page);
13 pl->mapped_tail = NULL;
14 }
15}
16
17int ceph_pagelist_release(struct ceph_pagelist *pl)
18{
19 ceph_pagelist_unmap_tail(pl);
20 while (!list_empty(&pl->head)) {
21 struct page *page = list_first_entry(&pl->head, struct page,
22 lru);
23 list_del(&page->lru);
24 __free_page(page);
25 }
26 ceph_pagelist_free_reserve(pl);
27 return 0;
28}
29EXPORT_SYMBOL(ceph_pagelist_release);
30
31static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
32{
33 struct page *page;
34
35 if (!pl->num_pages_free) {
36 page = __page_cache_alloc(GFP_NOFS);
37 } else {
38 page = list_first_entry(&pl->free_list, struct page, lru);
39 list_del(&page->lru);
40 --pl->num_pages_free;
41 }
42 if (!page)
43 return -ENOMEM;
44 pl->room += PAGE_SIZE;
45 ceph_pagelist_unmap_tail(pl);
46 list_add_tail(&page->lru, &pl->head);
47 pl->mapped_tail = kmap(page);
48 return 0;
49}
50
51int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len)
52{
53 while (pl->room < len) {
54 size_t bit = pl->room;
55 int ret;
56
57 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
58 buf, bit);
59 pl->length += bit;
60 pl->room -= bit;
61 buf += bit;
62 len -= bit;
63 ret = ceph_pagelist_addpage(pl);
64 if (ret)
65 return ret;
66 }
67
68 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
69 pl->length += len;
70 pl->room -= len;
71 return 0;
72}
73EXPORT_SYMBOL(ceph_pagelist_append);
74
75/**
76 * Allocate enough pages for a pagelist to append the given amount
77 * of data without allocating.
78 * Returns: 0 on success, -ENOMEM on error.
79 */
80int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space)
81{
82 if (space <= pl->room)
83 return 0;
84 space -= pl->room;
85 space = (space + PAGE_SIZE - 1) >> PAGE_SHIFT; /* conv to num pages */
86
87 while (space > pl->num_pages_free) {
88 struct page *page = __page_cache_alloc(GFP_NOFS);
89 if (!page)
90 return -ENOMEM;
91 list_add_tail(&page->lru, &pl->free_list);
92 ++pl->num_pages_free;
93 }
94 return 0;
95}
96EXPORT_SYMBOL(ceph_pagelist_reserve);
97
98/**
99 * Free any pages that have been preallocated.
100 */
101int ceph_pagelist_free_reserve(struct ceph_pagelist *pl)
102{
103 while (!list_empty(&pl->free_list)) {
104 struct page *page = list_first_entry(&pl->free_list,
105 struct page, lru);
106 list_del(&page->lru);
107 __free_page(page);
108 --pl->num_pages_free;
109 }
110 BUG_ON(pl->num_pages_free);
111 return 0;
112}
113EXPORT_SYMBOL(ceph_pagelist_free_reserve);
114
115/**
116 * Create a truncation point.
117 */
118void ceph_pagelist_set_cursor(struct ceph_pagelist *pl,
119 struct ceph_pagelist_cursor *c)
120{
121 c->pl = pl;
122 c->page_lru = pl->head.prev;
123 c->room = pl->room;
124}
125EXPORT_SYMBOL(ceph_pagelist_set_cursor);
126
127/**
128 * Truncate a pagelist to the given point. Move extra pages to reserve.
129 * This won't sleep.
130 * Returns: 0 on success,
131 * -EINVAL if the pagelist doesn't match the trunc point pagelist
132 */
133int ceph_pagelist_truncate(struct ceph_pagelist *pl,
134 struct ceph_pagelist_cursor *c)
135{
136 struct page *page;
137
138 if (pl != c->pl)
139 return -EINVAL;
140 ceph_pagelist_unmap_tail(pl);
141 while (pl->head.prev != c->page_lru) {
142 page = list_entry(pl->head.prev, struct page, lru);
143 list_del(&page->lru); /* remove from pagelist */
144 list_add_tail(&page->lru, &pl->free_list); /* add to reserve */
145 ++pl->num_pages_free;
146 }
147 pl->room = c->room;
148 if (!list_empty(&pl->head)) {
149 page = list_entry(pl->head.prev, struct page, lru);
150 pl->mapped_tail = kmap(page);
151 }
152 return 0;
153}
154EXPORT_SYMBOL(ceph_pagelist_truncate);
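The cursor pair above exists so a caller can reserve pages up front, mark a point, append speculatively, and move any extra pages back to the reserve without sleeping if the append fails. A short sketch using only the calls defined in this file, assuming the pagelist was already set up with ceph_pagelist_init():

#include <linux/ceph/pagelist.h>

static int sketch_pagelist_append(struct ceph_pagelist *pl,
				  const void *payload, size_t len)
{
	struct ceph_pagelist_cursor c;
	int ret;

	ret = ceph_pagelist_reserve(pl, len);	/* pre-allocate; may sleep */
	if (ret)
		return ret;			/* -ENOMEM */

	ceph_pagelist_set_cursor(pl, &c);	/* truncation point */
	ret = ceph_pagelist_append(pl, payload, len);
	if (ret)
		ceph_pagelist_truncate(pl, &c);	/* extra pages go back to the reserve */
	return ret;
}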
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
new file mode 100644
index 000000000000..54caf0687155
--- /dev/null
+++ b/net/ceph/pagevec.c
@@ -0,0 +1,223 @@
1#include <linux/ceph/ceph_debug.h>
2
3#include <linux/module.h>
4#include <linux/sched.h>
5#include <linux/slab.h>
6#include <linux/file.h>
7#include <linux/namei.h>
8#include <linux/writeback.h>
9
10#include <linux/ceph/libceph.h>
11
12/*
13 * build a vector of user pages
14 */
15struct page **ceph_get_direct_page_vector(const char __user *data,
16 int num_pages,
17 loff_t off, size_t len)
18{
19 struct page **pages;
20 int rc;
21
22 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
23 if (!pages)
24 return ERR_PTR(-ENOMEM);
25
26 down_read(&current->mm->mmap_sem);
27 rc = get_user_pages(current, current->mm, (unsigned long)data,
28 num_pages, 0, 0, pages, NULL);
29 up_read(&current->mm->mmap_sem);
30 if (rc < 0)
31 goto fail;
32 return pages;
33
34fail:
35 kfree(pages);
36 return ERR_PTR(rc);
37}
38EXPORT_SYMBOL(ceph_get_direct_page_vector);
39
40void ceph_put_page_vector(struct page **pages, int num_pages)
41{
42 int i;
43
44 for (i = 0; i < num_pages; i++)
45 put_page(pages[i]);
46 kfree(pages);
47}
48EXPORT_SYMBOL(ceph_put_page_vector);
49
50void ceph_release_page_vector(struct page **pages, int num_pages)
51{
52 int i;
53
54 for (i = 0; i < num_pages; i++)
55 __free_pages(pages[i], 0);
56 kfree(pages);
57}
58EXPORT_SYMBOL(ceph_release_page_vector);
59
60/*
61 * allocate a vector of new pages
62 */
63struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
64{
65 struct page **pages;
66 int i;
67
68 pages = kmalloc(sizeof(*pages) * num_pages, flags);
69 if (!pages)
70 return ERR_PTR(-ENOMEM);
71 for (i = 0; i < num_pages; i++) {
72 pages[i] = __page_cache_alloc(flags);
73 if (pages[i] == NULL) {
74 ceph_release_page_vector(pages, i);
75 return ERR_PTR(-ENOMEM);
76 }
77 }
78 return pages;
79}
80EXPORT_SYMBOL(ceph_alloc_page_vector);
81
82/*
83 * copy user data into a page vector
84 */
85int ceph_copy_user_to_page_vector(struct page **pages,
86 const char __user *data,
87 loff_t off, size_t len)
88{
89 int i = 0;
90 int po = off & ~PAGE_CACHE_MASK;
91 int left = len;
92 int l, bad;
93
94 while (left > 0) {
95 l = min_t(int, PAGE_CACHE_SIZE-po, left);
96 bad = copy_from_user(page_address(pages[i]) + po, data, l);
97 if (bad == l)
98 return -EFAULT;
99 data += l - bad;
100 left -= l - bad;
101 po += l - bad;
102 if (po == PAGE_CACHE_SIZE) {
103 po = 0;
104 i++;
105 }
106 }
107 return len;
108}
109EXPORT_SYMBOL(ceph_copy_user_to_page_vector);
110
111int ceph_copy_to_page_vector(struct page **pages,
112 const char *data,
113 loff_t off, size_t len)
114{
115 int i = 0;
116 size_t po = off & ~PAGE_CACHE_MASK;
117 size_t left = len;
118 size_t l;
119
120 while (left > 0) {
121 l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
122 memcpy(page_address(pages[i]) + po, data, l);
123 data += l;
124 left -= l;
125 po += l;
126 if (po == PAGE_CACHE_SIZE) {
127 po = 0;
128 i++;
129 }
130 }
131 return len;
132}
133EXPORT_SYMBOL(ceph_copy_to_page_vector);
134
135int ceph_copy_from_page_vector(struct page **pages,
136 char *data,
137 loff_t off, size_t len)
138{
139 int i = 0;
140 size_t po = off & ~PAGE_CACHE_MASK;
141 size_t left = len;
142 size_t l;
143
144 while (left > 0) {
145 l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
146 memcpy(data, page_address(pages[i]) + po, l);
147 data += l;
148 left -= l;
149 po += l;
150 if (po == PAGE_CACHE_SIZE) {
151 po = 0;
152 i++;
153 }
154 }
155 return len;
156}
157EXPORT_SYMBOL(ceph_copy_from_page_vector);
158
159/*
160 * copy user data from a page vector into a user pointer
161 */
162int ceph_copy_page_vector_to_user(struct page **pages,
163 char __user *data,
164 loff_t off, size_t len)
165{
166 int i = 0;
167 int po = off & ~PAGE_CACHE_MASK;
168 int left = len;
169 int l, bad;
170
171 while (left > 0) {
172 l = min_t(int, left, PAGE_CACHE_SIZE-po);
173 bad = copy_to_user(data, page_address(pages[i]) + po, l);
174 if (bad == l)
175 return -EFAULT;
176 data += l - bad;
177 left -= l - bad;
178 if (po) {
179 po += l - bad;
180 if (po == PAGE_CACHE_SIZE)
181 po = 0;
182 }
183 i++;
184 }
185 return len;
186}
187EXPORT_SYMBOL(ceph_copy_page_vector_to_user);
188
189/*
190 * Zero an extent within a page vector. Offset is relative to the
191 * start of the first page.
192 */
193void ceph_zero_page_vector_range(int off, int len, struct page **pages)
194{
195 int i = off >> PAGE_CACHE_SHIFT;
196
197 off &= ~PAGE_CACHE_MASK;
198
199 dout("zero_page_vector_page %u~%u\n", off, len);
200
201 /* leading partial page? */
202 if (off) {
203 int end = min((int)PAGE_CACHE_SIZE, off + len);
204 dout("zeroing %d %p head from %d\n", i, pages[i],
205 (int)off);
206 zero_user_segment(pages[i], off, end);
207 len -= (end - off);
208 i++;
209 }
210 while (len >= PAGE_CACHE_SIZE) {
211 dout("zeroing %d %p len=%d\n", i, pages[i], len);
212 zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
213 len -= PAGE_CACHE_SIZE;
214 i++;
215 }
216 /* trailing partial page? */
217 if (len) {
218 dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
219 zero_user_segment(pages[i], 0, len);
220 }
221}
222EXPORT_SYMBOL(ceph_zero_page_vector_range);
223
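These helpers pair allocation and teardown explicitly: vectors of user pages taken with ceph_get_direct_page_vector() are dropped with ceph_put_page_vector(), while pages the vector owns are freed with ceph_release_page_vector(). A sketch of the owned-pages path (the round-trip function is hypothetical):

#include <linux/err.h>
#include <linux/gfp.h>
#include <linux/ceph/libceph.h>

static int sketch_pagevec_roundtrip(const char *data, size_t len)
{
	int num_pages = calc_pages_for(0, len);
	struct page **pages;
	int ret;

	pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = ceph_copy_to_page_vector(pages, data, 0, len);

	/* pages from ceph_alloc_page_vector() are owned by the vector */
	ceph_release_page_vector(pages, num_pages);
	return ret < 0 ? ret : 0;
}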
diff --git a/net/core/sock.c b/net/core/sock.c
index ef30e9d286e7..7d99e13148e6 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1078,8 +1078,11 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
1078#ifdef CONFIG_CGROUPS 1078#ifdef CONFIG_CGROUPS
1079void sock_update_classid(struct sock *sk) 1079void sock_update_classid(struct sock *sk)
1080{ 1080{
1081 u32 classid = task_cls_classid(current); 1081 u32 classid;
1082 1082
1083 rcu_read_lock(); /* doing current task, which cannot vanish. */
1084 classid = task_cls_classid(current);
1085 rcu_read_unlock();
1083 if (classid && classid != sk->sk_classid) 1086 if (classid && classid != sk->sk_classid)
1084 sk->sk_classid = classid; 1087 sk->sk_classid = classid;
1085} 1088}
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
index 244f7cb08d68..37f8adb68c79 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -11,6 +11,7 @@
11#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
12#include <linux/seq_file.h> 12#include <linux/seq_file.h>
13#include <linux/percpu.h> 13#include <linux/percpu.h>
14#include <linux/security.h>
14#include <net/net_namespace.h> 15#include <net/net_namespace.h>
15 16
16#include <linux/netfilter.h> 17#include <linux/netfilter.h>
@@ -87,6 +88,29 @@ static void ct_seq_stop(struct seq_file *s, void *v)
87 rcu_read_unlock(); 88 rcu_read_unlock();
88} 89}
89 90
91#ifdef CONFIG_NF_CONNTRACK_SECMARK
92static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
93{
94 int ret;
95 u32 len;
96 char *secctx;
97
98 ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
99 if (ret)
100 return ret;
101
102 ret = seq_printf(s, "secctx=%s ", secctx);
103
104 security_release_secctx(secctx, len);
105 return ret;
106}
107#else
108static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
109{
110 return 0;
111}
112#endif
113
90static int ct_seq_show(struct seq_file *s, void *v) 114static int ct_seq_show(struct seq_file *s, void *v)
91{ 115{
92 struct nf_conntrack_tuple_hash *hash = v; 116 struct nf_conntrack_tuple_hash *hash = v;
@@ -148,10 +172,8 @@ static int ct_seq_show(struct seq_file *s, void *v)
148 goto release; 172 goto release;
149#endif 173#endif
150 174
151#ifdef CONFIG_NF_CONNTRACK_SECMARK 175 if (ct_show_secctx(s, ct))
152 if (seq_printf(s, "secmark=%u ", ct->secmark))
153 goto release; 176 goto release;
154#endif
155 177
156 if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use))) 178 if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)))
157 goto release; 179 goto release;
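ct_show_secctx() above is the canonical round trip for displaying a secmark: ask the LSM for a context string with security_secid_to_secctx(), print it, and always release it with security_release_secctx(). The same pattern outside seq_file, sketched with a hypothetical logging helper:

#include <linux/kernel.h>
#include <linux/security.h>

static void sketch_log_secctx(u32 secmark)
{
	char *secctx;
	u32 len;

	if (security_secid_to_secctx(secmark, &secctx, &len))
		return;				/* LSM has no context for this secid */

	pr_info("secctx=%s\n", secctx);
	security_release_secctx(secctx, len);	/* always pair with the lookup */
}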
diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c
index 8c8632d9b93c..957c9241fb0c 100644
--- a/net/ipv4/netfilter/nf_nat_core.c
+++ b/net/ipv4/netfilter/nf_nat_core.c
@@ -38,7 +38,7 @@ static DEFINE_SPINLOCK(nf_nat_lock);
38static struct nf_conntrack_l3proto *l3proto __read_mostly; 38static struct nf_conntrack_l3proto *l3proto __read_mostly;
39 39
40#define MAX_IP_NAT_PROTO 256 40#define MAX_IP_NAT_PROTO 256
41static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO] 41static const struct nf_nat_protocol __rcu *nf_nat_protos[MAX_IP_NAT_PROTO]
42 __read_mostly; 42 __read_mostly;
43 43
44static inline const struct nf_nat_protocol * 44static inline const struct nf_nat_protocol *
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 78b505d33bfb..fdaec7daff1d 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -27,7 +27,7 @@
27 27
28static DEFINE_MUTEX(afinfo_mutex); 28static DEFINE_MUTEX(afinfo_mutex);
29 29
30const struct nf_afinfo *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly; 30const struct nf_afinfo __rcu *nf_afinfo[NFPROTO_NUMPROTO] __read_mostly;
31EXPORT_SYMBOL(nf_afinfo); 31EXPORT_SYMBOL(nf_afinfo);
32 32
33int nf_register_afinfo(const struct nf_afinfo *afinfo) 33int nf_register_afinfo(const struct nf_afinfo *afinfo)
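The __rcu annotations added throughout these netfilter files do not change behaviour; they let sparse verify that such pointers are only published with rcu_assign_pointer() and only read through rcu_dereference() under rcu_read_lock(). A self-contained sketch of that access pattern with a hypothetical handler pointer:

#include <linux/rcupdate.h>

struct sketch_handler {
	void (*fn)(void);
};

static struct sketch_handler __rcu *sketch_active;

static void sketch_install(struct sketch_handler *h)
{
	rcu_assign_pointer(sketch_active, h);	/* publish for readers */
}

static void sketch_call(void)
{
	struct sketch_handler *h;

	rcu_read_lock();
	h = rcu_dereference(sketch_active);	/* annotated read */
	if (h)
		h->fn();
	rcu_read_unlock();
}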
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index cdcc7649476b..5702de35e2bb 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -26,10 +26,10 @@
26 26
27static DEFINE_MUTEX(nf_ct_ecache_mutex); 27static DEFINE_MUTEX(nf_ct_ecache_mutex);
28 28
29struct nf_ct_event_notifier *nf_conntrack_event_cb __read_mostly; 29struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb __read_mostly;
30EXPORT_SYMBOL_GPL(nf_conntrack_event_cb); 30EXPORT_SYMBOL_GPL(nf_conntrack_event_cb);
31 31
32struct nf_exp_event_notifier *nf_expect_event_cb __read_mostly; 32struct nf_exp_event_notifier __rcu *nf_expect_event_cb __read_mostly;
33EXPORT_SYMBOL_GPL(nf_expect_event_cb); 33EXPORT_SYMBOL_GPL(nf_expect_event_cb);
34 34
35/* deliver cached events and clear cache entry - must be called with locally 35/* deliver cached events and clear cache entry - must be called with locally
diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c
index 8d9e4c949b96..bd82450c193f 100644
--- a/net/netfilter/nf_conntrack_extend.c
+++ b/net/netfilter/nf_conntrack_extend.c
@@ -16,7 +16,7 @@
16#include <linux/skbuff.h> 16#include <linux/skbuff.h>
17#include <net/netfilter/nf_conntrack_extend.h> 17#include <net/netfilter/nf_conntrack_extend.h>
18 18
19static struct nf_ct_ext_type *nf_ct_ext_types[NF_CT_EXT_NUM]; 19static struct nf_ct_ext_type __rcu *nf_ct_ext_types[NF_CT_EXT_NUM];
20static DEFINE_MUTEX(nf_ct_ext_type_mutex); 20static DEFINE_MUTEX(nf_ct_ext_type_mutex);
21 21
22void __nf_ct_ext_destroy(struct nf_conn *ct) 22void __nf_ct_ext_destroy(struct nf_conn *ct)
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 5bae1cd15eea..146476c6441a 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -22,6 +22,7 @@
22#include <linux/rculist_nulls.h> 22#include <linux/rculist_nulls.h>
23#include <linux/types.h> 23#include <linux/types.h>
24#include <linux/timer.h> 24#include <linux/timer.h>
25#include <linux/security.h>
25#include <linux/skbuff.h> 26#include <linux/skbuff.h>
26#include <linux/errno.h> 27#include <linux/errno.h>
27#include <linux/netlink.h> 28#include <linux/netlink.h>
@@ -245,16 +246,31 @@ nla_put_failure:
245 246
246#ifdef CONFIG_NF_CONNTRACK_SECMARK 247#ifdef CONFIG_NF_CONNTRACK_SECMARK
247static inline int 248static inline int
248ctnetlink_dump_secmark(struct sk_buff *skb, const struct nf_conn *ct) 249ctnetlink_dump_secctx(struct sk_buff *skb, const struct nf_conn *ct)
249{ 250{
250 NLA_PUT_BE32(skb, CTA_SECMARK, htonl(ct->secmark)); 251 struct nlattr *nest_secctx;
251 return 0; 252 int len, ret;
253 char *secctx;
254
255 ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
256 if (ret)
257 return ret;
258
259 ret = -1;
260 nest_secctx = nla_nest_start(skb, CTA_SECCTX | NLA_F_NESTED);
261 if (!nest_secctx)
262 goto nla_put_failure;
263
264 NLA_PUT_STRING(skb, CTA_SECCTX_NAME, secctx);
265 nla_nest_end(skb, nest_secctx);
252 266
267 ret = 0;
253nla_put_failure: 268nla_put_failure:
254 return -1; 269 security_release_secctx(secctx, len);
270 return ret;
255} 271}
256#else 272#else
257#define ctnetlink_dump_secmark(a, b) (0) 273#define ctnetlink_dump_secctx(a, b) (0)
258#endif 274#endif
259 275
260#define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple) 276#define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
@@ -391,7 +407,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
391 ctnetlink_dump_protoinfo(skb, ct) < 0 || 407 ctnetlink_dump_protoinfo(skb, ct) < 0 ||
392 ctnetlink_dump_helpinfo(skb, ct) < 0 || 408 ctnetlink_dump_helpinfo(skb, ct) < 0 ||
393 ctnetlink_dump_mark(skb, ct) < 0 || 409 ctnetlink_dump_mark(skb, ct) < 0 ||
394 ctnetlink_dump_secmark(skb, ct) < 0 || 410 ctnetlink_dump_secctx(skb, ct) < 0 ||
395 ctnetlink_dump_id(skb, ct) < 0 || 411 ctnetlink_dump_id(skb, ct) < 0 ||
396 ctnetlink_dump_use(skb, ct) < 0 || 412 ctnetlink_dump_use(skb, ct) < 0 ||
397 ctnetlink_dump_master(skb, ct) < 0 || 413 ctnetlink_dump_master(skb, ct) < 0 ||
@@ -437,6 +453,17 @@ ctnetlink_counters_size(const struct nf_conn *ct)
437 ; 453 ;
438} 454}
439 455
456#ifdef CONFIG_NF_CONNTRACK_SECMARK
457static int ctnetlink_nlmsg_secctx_size(const struct nf_conn *ct)
458{
459 int len;
460
461 security_secid_to_secctx(ct->secmark, NULL, &len);
462
463 return sizeof(char) * len;
464}
465#endif
466
440static inline size_t 467static inline size_t
441ctnetlink_nlmsg_size(const struct nf_conn *ct) 468ctnetlink_nlmsg_size(const struct nf_conn *ct)
442{ 469{
@@ -453,7 +480,8 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct)
453 + nla_total_size(0) /* CTA_HELP */ 480 + nla_total_size(0) /* CTA_HELP */
454 + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */ 481 + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */
455#ifdef CONFIG_NF_CONNTRACK_SECMARK 482#ifdef CONFIG_NF_CONNTRACK_SECMARK
456 + nla_total_size(sizeof(u_int32_t)) /* CTA_SECMARK */ 483 + nla_total_size(0) /* CTA_SECCTX */
484 + nla_total_size(ctnetlink_nlmsg_secctx_size(ct)) /* CTA_SECCTX_NAME */
457#endif 485#endif
458#ifdef CONFIG_NF_NAT_NEEDED 486#ifdef CONFIG_NF_NAT_NEEDED
459 + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */ 487 + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
@@ -556,7 +584,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item)
556 584
557#ifdef CONFIG_NF_CONNTRACK_SECMARK 585#ifdef CONFIG_NF_CONNTRACK_SECMARK
558 if ((events & (1 << IPCT_SECMARK) || ct->secmark) 586 if ((events & (1 << IPCT_SECMARK) || ct->secmark)
559 && ctnetlink_dump_secmark(skb, ct) < 0) 587 && ctnetlink_dump_secctx(skb, ct) < 0)
560 goto nla_put_failure; 588 goto nla_put_failure;
561#endif 589#endif
562 590
diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c
index 5886ba1d52a0..ed6d92958023 100644
--- a/net/netfilter/nf_conntrack_proto.c
+++ b/net/netfilter/nf_conntrack_proto.c
@@ -28,8 +28,8 @@
28#include <net/netfilter/nf_conntrack_l4proto.h> 28#include <net/netfilter/nf_conntrack_l4proto.h>
29#include <net/netfilter/nf_conntrack_core.h> 29#include <net/netfilter/nf_conntrack_core.h>
30 30
31static struct nf_conntrack_l4proto **nf_ct_protos[PF_MAX] __read_mostly; 31static struct nf_conntrack_l4proto __rcu **nf_ct_protos[PF_MAX] __read_mostly;
32struct nf_conntrack_l3proto *nf_ct_l3protos[AF_MAX] __read_mostly; 32struct nf_conntrack_l3proto __rcu *nf_ct_l3protos[AF_MAX] __read_mostly;
33EXPORT_SYMBOL_GPL(nf_ct_l3protos); 33EXPORT_SYMBOL_GPL(nf_ct_l3protos);
34 34
35static DEFINE_MUTEX(nf_ct_proto_mutex); 35static DEFINE_MUTEX(nf_ct_proto_mutex);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index eb973fcd67ab..0fb65705b44b 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -15,6 +15,7 @@
15#include <linux/seq_file.h> 15#include <linux/seq_file.h>
16#include <linux/percpu.h> 16#include <linux/percpu.h>
17#include <linux/netdevice.h> 17#include <linux/netdevice.h>
18#include <linux/security.h>
18#include <net/net_namespace.h> 19#include <net/net_namespace.h>
19#ifdef CONFIG_SYSCTL 20#ifdef CONFIG_SYSCTL
20#include <linux/sysctl.h> 21#include <linux/sysctl.h>
@@ -108,6 +109,29 @@ static void ct_seq_stop(struct seq_file *s, void *v)
108 rcu_read_unlock(); 109 rcu_read_unlock();
109} 110}
110 111
112#ifdef CONFIG_NF_CONNTRACK_SECMARK
113static int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
114{
115 int ret;
116 u32 len;
117 char *secctx;
118
119 ret = security_secid_to_secctx(ct->secmark, &secctx, &len);
120 if (ret)
121 return ret;
122
123 ret = seq_printf(s, "secctx=%s ", secctx);
124
125 security_release_secctx(secctx, len);
126 return ret;
127}
128#else
129static inline int ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
130{
131 return 0;
132}
133#endif
134
111/* return 0 on success, 1 in case of error */ 135/* return 0 on success, 1 in case of error */
112static int ct_seq_show(struct seq_file *s, void *v) 136static int ct_seq_show(struct seq_file *s, void *v)
113{ 137{
@@ -168,10 +192,8 @@ static int ct_seq_show(struct seq_file *s, void *v)
168 goto release; 192 goto release;
169#endif 193#endif
170 194
171#ifdef CONFIG_NF_CONNTRACK_SECMARK 195 if (ct_show_secctx(s, ct))
172 if (seq_printf(s, "secmark=%u ", ct->secmark))
173 goto release; 196 goto release;
174#endif
175 197
176#ifdef CONFIG_NF_CONNTRACK_ZONES 198#ifdef CONFIG_NF_CONNTRACK_ZONES
177 if (seq_printf(s, "zone=%u ", nf_ct_zone(ct))) 199 if (seq_printf(s, "zone=%u ", nf_ct_zone(ct)))
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index 7df37fd786bc..b07393eab88e 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -16,7 +16,7 @@
16#define NF_LOG_PREFIXLEN 128 16#define NF_LOG_PREFIXLEN 128
17#define NFLOGGER_NAME_LEN 64 17#define NFLOGGER_NAME_LEN 64
18 18
19static const struct nf_logger *nf_loggers[NFPROTO_NUMPROTO] __read_mostly; 19static const struct nf_logger __rcu *nf_loggers[NFPROTO_NUMPROTO] __read_mostly;
20static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly; 20static struct list_head nf_loggers_l[NFPROTO_NUMPROTO] __read_mostly;
21static DEFINE_MUTEX(nf_log_mutex); 21static DEFINE_MUTEX(nf_log_mutex);
22 22
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 78b3cf9c519c..74aebed5bd28 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -18,7 +18,7 @@
18 * long term mutex. The handler must provide an outfn() to accept packets 18 * long term mutex. The handler must provide an outfn() to accept packets
19 * for queueing and must reinject all packets it receives, no matter what. 19 * for queueing and must reinject all packets it receives, no matter what.
20 */ 20 */
21static const struct nf_queue_handler *queue_handler[NFPROTO_NUMPROTO] __read_mostly; 21static const struct nf_queue_handler __rcu *queue_handler[NFPROTO_NUMPROTO] __read_mostly;
22 22
23static DEFINE_MUTEX(queue_handler_mutex); 23static DEFINE_MUTEX(queue_handler_mutex);
24 24
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index 0cb6053f02fd..782e51986a6f 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -9,7 +9,6 @@
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/gfp.h> 10#include <linux/gfp.h>
11#include <linux/skbuff.h> 11#include <linux/skbuff.h>
12#include <linux/selinux.h>
13#include <linux/netfilter_ipv4/ip_tables.h> 12#include <linux/netfilter_ipv4/ip_tables.h>
14#include <linux/netfilter_ipv6/ip6_tables.h> 13#include <linux/netfilter_ipv6/ip6_tables.h>
15#include <linux/netfilter/x_tables.h> 14#include <linux/netfilter/x_tables.h>
diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c
index 23b2d6c486b5..9faf5e050b79 100644
--- a/net/netfilter/xt_SECMARK.c
+++ b/net/netfilter/xt_SECMARK.c
@@ -14,8 +14,8 @@
14 */ 14 */
15#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 15#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/security.h>
17#include <linux/skbuff.h> 18#include <linux/skbuff.h>
18#include <linux/selinux.h>
19#include <linux/netfilter/x_tables.h> 19#include <linux/netfilter/x_tables.h>
20#include <linux/netfilter/xt_SECMARK.h> 20#include <linux/netfilter/xt_SECMARK.h>
21 21
@@ -39,9 +39,8 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
39 39
40 switch (mode) { 40 switch (mode) {
41 case SECMARK_MODE_SEL: 41 case SECMARK_MODE_SEL:
42 secmark = info->u.sel.selsid; 42 secmark = info->secid;
43 break; 43 break;
44
45 default: 44 default:
46 BUG(); 45 BUG();
47 } 46 }
@@ -50,33 +49,33 @@ secmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
50 return XT_CONTINUE; 49 return XT_CONTINUE;
51} 50}
52 51
53static int checkentry_selinux(struct xt_secmark_target_info *info) 52static int checkentry_lsm(struct xt_secmark_target_info *info)
54{ 53{
55 int err; 54 int err;
56 struct xt_secmark_target_selinux_info *sel = &info->u.sel;
57 55
58 sel->selctx[SECMARK_SELCTX_MAX - 1] = '\0'; 56 info->secctx[SECMARK_SECCTX_MAX - 1] = '\0';
57 info->secid = 0;
59 58
60 err = selinux_string_to_sid(sel->selctx, &sel->selsid); 59 err = security_secctx_to_secid(info->secctx, strlen(info->secctx),
60 &info->secid);
61 if (err) { 61 if (err) {
62 if (err == -EINVAL) 62 if (err == -EINVAL)
63 pr_info("invalid SELinux context \'%s\'\n", 63 pr_info("invalid security context \'%s\'\n", info->secctx);
64 sel->selctx);
65 return err; 64 return err;
66 } 65 }
67 66
68 if (!sel->selsid) { 67 if (!info->secid) {
69 pr_info("unable to map SELinux context \'%s\'\n", sel->selctx); 68 pr_info("unable to map security context \'%s\'\n", info->secctx);
70 return -ENOENT; 69 return -ENOENT;
71 } 70 }
72 71
73 err = selinux_secmark_relabel_packet_permission(sel->selsid); 72 err = security_secmark_relabel_packet(info->secid);
74 if (err) { 73 if (err) {
75 pr_info("unable to obtain relabeling permission\n"); 74 pr_info("unable to obtain relabeling permission\n");
76 return err; 75 return err;
77 } 76 }
78 77
79 selinux_secmark_refcount_inc(); 78 security_secmark_refcount_inc();
80 return 0; 79 return 0;
81} 80}
82 81
@@ -100,16 +99,16 @@ static int secmark_tg_check(const struct xt_tgchk_param *par)
100 99
101 switch (info->mode) { 100 switch (info->mode) {
102 case SECMARK_MODE_SEL: 101 case SECMARK_MODE_SEL:
103 err = checkentry_selinux(info);
104 if (err <= 0)
105 return err;
106 break; 102 break;
107
108 default: 103 default:
109 pr_info("invalid mode: %hu\n", info->mode); 104 pr_info("invalid mode: %hu\n", info->mode);
110 return -EINVAL; 105 return -EINVAL;
111 } 106 }
112 107
108 err = checkentry_lsm(info);
109 if (err)
110 return err;
111
113 if (!mode) 112 if (!mode)
114 mode = info->mode; 113 mode = info->mode;
115 return 0; 114 return 0;
@@ -119,7 +118,7 @@ static void secmark_tg_destroy(const struct xt_tgdtor_param *par)
119{ 118{
120 switch (mode) { 119 switch (mode) {
121 case SECMARK_MODE_SEL: 120 case SECMARK_MODE_SEL:
122 selinux_secmark_refcount_dec(); 121 security_secmark_refcount_dec();
123 } 122 }
124} 123}
125 124
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 78ef2c5e130b..37dff78e9cb1 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -123,7 +123,7 @@ static int cls_cgroup_classify(struct sk_buff *skb, struct tcf_proto *tp,
123 * calls by looking at the number of nested bh disable calls because 123 * calls by looking at the number of nested bh disable calls because
124 * softirqs always disable bh. 124 * softirqs always disable bh.
125 */ 125 */
126 if (softirq_count() != SOFTIRQ_OFFSET) { 126 if (in_serving_softirq()) {
127 /* If there is an sk_classid we'll use that. */ 127 /* If there is an sk_classid we'll use that. */
128 if (!skb->sk) 128 if (!skb->sk)
129 return -1; 129 return -1;
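
The cls_cgroup test above relies on in_serving_softirq() meaning "currently executing a softirq", as opposed to the old exact comparison against SOFTIRQ_OFFSET, which nested local_bh_disable() sections could defeat. Assuming the usual <linux/hardirq.h> helper, it is a simple bit test on the preempt count:

	/* assumed definition, shown for reference only */
	#define in_serving_softirq()	(softirq_count() & SOFTIRQ_OFFSET)
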
diff --git a/security/apparmor/.gitignore b/security/apparmor/.gitignore
index 0a0a99f3b083..4d995aeaebc0 100644
--- a/security/apparmor/.gitignore
+++ b/security/apparmor/.gitignore
@@ -3,3 +3,4 @@
3# 3#
4af_names.h 4af_names.h
5capability_names.h 5capability_names.h
6rlim_names.h
diff --git a/security/apparmor/apparmorfs.c b/security/apparmor/apparmorfs.c
index 7320331b44ab..544ff5837cb6 100644
--- a/security/apparmor/apparmorfs.c
+++ b/security/apparmor/apparmorfs.c
@@ -29,7 +29,7 @@
29 * aa_simple_write_to_buffer - common routine for getting policy from user 29 * aa_simple_write_to_buffer - common routine for getting policy from user
30 * @op: operation doing the user buffer copy 30 * @op: operation doing the user buffer copy
31 * @userbuf: user buffer to copy data from (NOT NULL) 31 * @userbuf: user buffer to copy data from (NOT NULL)
32 * @alloc_size: size of user buffer 32 * @alloc_size: size of user buffer (REQUIRES: @alloc_size >= @copy_size)
33 * @copy_size: size of data to copy from user buffer 33 * @copy_size: size of data to copy from user buffer
34 * @pos: position write is at in the file (NOT NULL) 34 * @pos: position write is at in the file (NOT NULL)
35 * 35 *
@@ -42,6 +42,8 @@ static char *aa_simple_write_to_buffer(int op, const char __user *userbuf,
42{ 42{
43 char *data; 43 char *data;
44 44
45 BUG_ON(copy_size > alloc_size);
46
45 if (*pos != 0) 47 if (*pos != 0)
46 /* only writes from pos 0, that is complete writes */ 48 /* only writes from pos 0, that is complete writes */
47 return ERR_PTR(-ESPIPE); 49 return ERR_PTR(-ESPIPE);
diff --git a/security/capability.c b/security/capability.c
index 95a6599a37bb..30ae00fbecd5 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -677,7 +677,18 @@ static void cap_inet_conn_established(struct sock *sk, struct sk_buff *skb)
677{ 677{
678} 678}
679 679
680static int cap_secmark_relabel_packet(u32 secid)
681{
682 return 0;
683}
680 684
685static void cap_secmark_refcount_inc(void)
686{
687}
688
689static void cap_secmark_refcount_dec(void)
690{
691}
681 692
682static void cap_req_classify_flow(const struct request_sock *req, 693static void cap_req_classify_flow(const struct request_sock *req,
683 struct flowi *fl) 694 struct flowi *fl)
@@ -777,7 +788,8 @@ static int cap_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
777 788
778static int cap_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid) 789static int cap_secctx_to_secid(const char *secdata, u32 seclen, u32 *secid)
779{ 790{
780 return -EOPNOTSUPP; 791 *secid = 0;
792 return 0;
781} 793}
782 794
783static void cap_release_secctx(char *secdata, u32 seclen) 795static void cap_release_secctx(char *secdata, u32 seclen)
@@ -1018,6 +1030,9 @@ void __init security_fixup_ops(struct security_operations *ops)
1018 set_to_cap_if_null(ops, inet_conn_request); 1030 set_to_cap_if_null(ops, inet_conn_request);
1019 set_to_cap_if_null(ops, inet_csk_clone); 1031 set_to_cap_if_null(ops, inet_csk_clone);
1020 set_to_cap_if_null(ops, inet_conn_established); 1032 set_to_cap_if_null(ops, inet_conn_established);
1033 set_to_cap_if_null(ops, secmark_relabel_packet);
1034 set_to_cap_if_null(ops, secmark_refcount_inc);
1035 set_to_cap_if_null(ops, secmark_refcount_dec);
1021 set_to_cap_if_null(ops, req_classify_flow); 1036 set_to_cap_if_null(ops, req_classify_flow);
1022 set_to_cap_if_null(ops, tun_dev_create); 1037 set_to_cap_if_null(ops, tun_dev_create);
1023 set_to_cap_if_null(ops, tun_dev_post_create); 1038 set_to_cap_if_null(ops, tun_dev_post_create);
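
The three set_to_cap_if_null() additions matter because security_fixup_ops() backfills any hook an LSM leaves NULL with the capability default, so the new secmark hooks are always callable. A minimal sketch of that macro, assuming the usual pattern in security/capability.c (the in-tree version also logs the override):

	#define set_to_cap_if_null(ops, function)			\
		do {							\
			if (!ops->function)				\
				ops->function = cap_##function;		\
		} while (0)
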
diff --git a/security/commoncap.c b/security/commoncap.c
index 9d172e6e330c..5e632b4857e4 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -719,14 +719,11 @@ static int cap_safe_nice(struct task_struct *p)
719/** 719/**
720 * cap_task_setscheduler - Determine if scheduler policy change is permitted 720 * cap_task_setscheduler - Determine if scheduler policy change is permitted
721 * @p: The task to affect 721 * @p: The task to affect
722 * @policy: The policy to effect
723 * @lp: The parameters to the scheduling policy
724 * 722 *
725 * Determine if the requested scheduler policy change is permitted for the 723 * Determine if the requested scheduler policy change is permitted for the
726 * specified task, returning 0 if permission is granted, -ve if denied. 724 * specified task, returning 0 if permission is granted, -ve if denied.
727 */ 725 */
728int cap_task_setscheduler(struct task_struct *p, int policy, 726int cap_task_setscheduler(struct task_struct *p)
729 struct sched_param *lp)
730{ 727{
731 return cap_safe_nice(p); 728 return cap_safe_nice(p);
732} 729}
diff --git a/security/security.c b/security/security.c
index c53949f17d9e..b50f472061a4 100644
--- a/security/security.c
+++ b/security/security.c
@@ -89,20 +89,12 @@ __setup("security=", choose_lsm);
89 * Return true if: 89 * Return true if:
90 * -The passed LSM is the one chosen by user at boot time, 90 * -The passed LSM is the one chosen by user at boot time,
91 * -or the passed LSM is configured as the default and the user did not 91 * -or the passed LSM is configured as the default and the user did not
92 * choose an alternate LSM at boot time, 92 * choose an alternate LSM at boot time.
93 * -or there is no default LSM set and the user didn't specify a
94 * specific LSM and we're the first to ask for registration permission,
95 * -or the passed LSM is currently loaded.
96 * Otherwise, return false. 93 * Otherwise, return false.
97 */ 94 */
98int __init security_module_enable(struct security_operations *ops) 95int __init security_module_enable(struct security_operations *ops)
99{ 96{
100 if (!*chosen_lsm) 97 return !strcmp(ops->name, chosen_lsm);
101 strncpy(chosen_lsm, ops->name, SECURITY_NAME_MAX);
102 else if (strncmp(ops->name, chosen_lsm, SECURITY_NAME_MAX))
103 return 0;
104
105 return 1;
106} 98}
107 99
108/** 100/**
@@ -786,10 +778,9 @@ int security_task_setrlimit(struct task_struct *p, unsigned int resource,
786 return security_ops->task_setrlimit(p, resource, new_rlim); 778 return security_ops->task_setrlimit(p, resource, new_rlim);
787} 779}
788 780
789int security_task_setscheduler(struct task_struct *p, 781int security_task_setscheduler(struct task_struct *p)
790 int policy, struct sched_param *lp)
791{ 782{
792 return security_ops->task_setscheduler(p, policy, lp); 783 return security_ops->task_setscheduler(p);
793} 784}
794 785
795int security_task_getscheduler(struct task_struct *p) 786int security_task_getscheduler(struct task_struct *p)
@@ -1145,6 +1136,24 @@ void security_inet_conn_established(struct sock *sk,
1145 security_ops->inet_conn_established(sk, skb); 1136 security_ops->inet_conn_established(sk, skb);
1146} 1137}
1147 1138
1139int security_secmark_relabel_packet(u32 secid)
1140{
1141 return security_ops->secmark_relabel_packet(secid);
1142}
1143EXPORT_SYMBOL(security_secmark_relabel_packet);
1144
1145void security_secmark_refcount_inc(void)
1146{
1147 security_ops->secmark_refcount_inc();
1148}
1149EXPORT_SYMBOL(security_secmark_refcount_inc);
1150
1151void security_secmark_refcount_dec(void)
1152{
1153 security_ops->secmark_refcount_dec();
1154}
1155EXPORT_SYMBOL(security_secmark_refcount_dec);
1156
1148int security_tun_dev_create(void) 1157int security_tun_dev_create(void)
1149{ 1158{
1150 return security_ops->tun_dev_create(); 1159 return security_ops->tun_dev_create();
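
After the simplification above, security_module_enable() is a plain name comparison against the boot-time "security=" choice. An LSM init path would use it roughly as follows (a sketch; example_lsm_init and example_ops are hypothetical names, the pattern follows the existing SELinux init):

	static int __init example_lsm_init(void)
	{
		if (!security_module_enable(&example_ops))
			return 0;	/* another LSM was chosen at boot */

		return register_security(&example_ops);
	}
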
diff --git a/security/selinux/Makefile b/security/selinux/Makefile
index 58d80f3bd6f6..ad5cd76ec231 100644
--- a/security/selinux/Makefile
+++ b/security/selinux/Makefile
@@ -2,25 +2,20 @@
2# Makefile for building the SELinux module as part of the kernel tree. 2# Makefile for building the SELinux module as part of the kernel tree.
3# 3#
4 4
5obj-$(CONFIG_SECURITY_SELINUX) := selinux.o ss/ 5obj-$(CONFIG_SECURITY_SELINUX) := selinux.o
6 6
7selinux-y := avc.o \ 7selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o netif.o \
8 hooks.o \ 8 netnode.o netport.o exports.o \
9 selinuxfs.o \ 9 ss/ebitmap.o ss/hashtab.o ss/symtab.o ss/sidtab.o ss/avtab.o \
10 netlink.o \ 10 ss/policydb.o ss/services.o ss/conditional.o ss/mls.o ss/status.o
11 nlmsgtab.o \
12 netif.o \
13 netnode.o \
14 netport.o \
15 exports.o
16 11
17selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o 12selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o
18 13
19selinux-$(CONFIG_NETLABEL) += netlabel.o 14selinux-$(CONFIG_NETLABEL) += netlabel.o
20 15
21EXTRA_CFLAGS += -Isecurity/selinux -Isecurity/selinux/include 16ccflags-y := -Isecurity/selinux -Isecurity/selinux/include
22 17
23$(obj)/avc.o: $(obj)/flask.h 18$(addprefix $(obj)/,$(selinux-y)): $(obj)/flask.h
24 19
25quiet_cmd_flask = GEN $(obj)/flask.h $(obj)/av_permissions.h 20quiet_cmd_flask = GEN $(obj)/flask.h $(obj)/av_permissions.h
26 cmd_flask = scripts/selinux/genheaders/genheaders $(obj)/flask.h $(obj)/av_permissions.h 21 cmd_flask = scripts/selinux/genheaders/genheaders $(obj)/flask.h $(obj)/av_permissions.h
diff --git a/security/selinux/exports.c b/security/selinux/exports.c
index c0a454aee1e0..90664385dead 100644
--- a/security/selinux/exports.c
+++ b/security/selinux/exports.c
@@ -11,58 +11,9 @@
11 * it under the terms of the GNU General Public License version 2, 11 * it under the terms of the GNU General Public License version 2,
12 * as published by the Free Software Foundation. 12 * as published by the Free Software Foundation.
13 */ 13 */
14#include <linux/types.h>
15#include <linux/kernel.h>
16#include <linux/module.h> 14#include <linux/module.h>
17#include <linux/selinux.h>
18#include <linux/fs.h>
19#include <linux/ipc.h>
20#include <asm/atomic.h>
21 15
22#include "security.h" 16#include "security.h"
23#include "objsec.h"
24
25/* SECMARK reference count */
26extern atomic_t selinux_secmark_refcount;
27
28int selinux_string_to_sid(char *str, u32 *sid)
29{
30 if (selinux_enabled)
31 return security_context_to_sid(str, strlen(str), sid);
32 else {
33 *sid = 0;
34 return 0;
35 }
36}
37EXPORT_SYMBOL_GPL(selinux_string_to_sid);
38
39int selinux_secmark_relabel_packet_permission(u32 sid)
40{
41 if (selinux_enabled) {
42 const struct task_security_struct *__tsec;
43 u32 tsid;
44
45 __tsec = current_security();
46 tsid = __tsec->sid;
47
48 return avc_has_perm(tsid, sid, SECCLASS_PACKET,
49 PACKET__RELABELTO, NULL);
50 }
51 return 0;
52}
53EXPORT_SYMBOL_GPL(selinux_secmark_relabel_packet_permission);
54
55void selinux_secmark_refcount_inc(void)
56{
57 atomic_inc(&selinux_secmark_refcount);
58}
59EXPORT_SYMBOL_GPL(selinux_secmark_refcount_inc);
60
61void selinux_secmark_refcount_dec(void)
62{
63 atomic_dec(&selinux_secmark_refcount);
64}
65EXPORT_SYMBOL_GPL(selinux_secmark_refcount_dec);
66 17
67bool selinux_is_enabled(void) 18bool selinux_is_enabled(void)
68{ 19{
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 4796ddd4e721..d9154cf90ae1 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3354,11 +3354,11 @@ static int selinux_task_setrlimit(struct task_struct *p, unsigned int resource,
3354 return 0; 3354 return 0;
3355} 3355}
3356 3356
3357static int selinux_task_setscheduler(struct task_struct *p, int policy, struct sched_param *lp) 3357static int selinux_task_setscheduler(struct task_struct *p)
3358{ 3358{
3359 int rc; 3359 int rc;
3360 3360
3361 rc = cap_task_setscheduler(p, policy, lp); 3361 rc = cap_task_setscheduler(p);
3362 if (rc) 3362 if (rc)
3363 return rc; 3363 return rc;
3364 3364
@@ -4279,6 +4279,27 @@ static void selinux_inet_conn_established(struct sock *sk, struct sk_buff *skb)
4279 selinux_skb_peerlbl_sid(skb, family, &sksec->peer_sid); 4279 selinux_skb_peerlbl_sid(skb, family, &sksec->peer_sid);
4280} 4280}
4281 4281
4282static int selinux_secmark_relabel_packet(u32 sid)
4283{
4284 const struct task_security_struct *__tsec;
4285 u32 tsid;
4286
4287 __tsec = current_security();
4288 tsid = __tsec->sid;
4289
4290 return avc_has_perm(tsid, sid, SECCLASS_PACKET, PACKET__RELABELTO, NULL);
4291}
4292
4293static void selinux_secmark_refcount_inc(void)
4294{
4295 atomic_inc(&selinux_secmark_refcount);
4296}
4297
4298static void selinux_secmark_refcount_dec(void)
4299{
4300 atomic_dec(&selinux_secmark_refcount);
4301}
4302
4282static void selinux_req_classify_flow(const struct request_sock *req, 4303static void selinux_req_classify_flow(const struct request_sock *req,
4283 struct flowi *fl) 4304 struct flowi *fl)
4284{ 4305{
@@ -5533,6 +5554,9 @@ static struct security_operations selinux_ops = {
5533 .inet_conn_request = selinux_inet_conn_request, 5554 .inet_conn_request = selinux_inet_conn_request,
5534 .inet_csk_clone = selinux_inet_csk_clone, 5555 .inet_csk_clone = selinux_inet_csk_clone,
5535 .inet_conn_established = selinux_inet_conn_established, 5556 .inet_conn_established = selinux_inet_conn_established,
5557 .secmark_relabel_packet = selinux_secmark_relabel_packet,
5558 .secmark_refcount_inc = selinux_secmark_refcount_inc,
5559 .secmark_refcount_dec = selinux_secmark_refcount_dec,
5536 .req_classify_flow = selinux_req_classify_flow, 5560 .req_classify_flow = selinux_req_classify_flow,
5537 .tun_dev_create = selinux_tun_dev_create, 5561 .tun_dev_create = selinux_tun_dev_create,
5538 .tun_dev_post_create = selinux_tun_dev_post_create, 5562 .tun_dev_post_create = selinux_tun_dev_post_create,
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
index b4c9eb4bd6f9..8858d2b2d4b6 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -17,7 +17,7 @@ struct security_class_mapping secclass_map[] = {
17 { "compute_av", "compute_create", "compute_member", 17 { "compute_av", "compute_create", "compute_member",
18 "check_context", "load_policy", "compute_relabel", 18 "check_context", "load_policy", "compute_relabel",
19 "compute_user", "setenforce", "setbool", "setsecparam", 19 "compute_user", "setenforce", "setbool", "setsecparam",
20 "setcheckreqprot", NULL } }, 20 "setcheckreqprot", "read_policy", NULL } },
21 { "process", 21 { "process",
22 { "fork", "transition", "sigchld", "sigkill", 22 { "fork", "transition", "sigchld", "sigkill",
23 "sigstop", "signull", "signal", "ptrace", "getsched", "setsched", 23 "sigstop", "signull", "signal", "ptrace", "getsched", "setsched",
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h
index 1f7c2491d3dc..671273eb1115 100644
--- a/security/selinux/include/security.h
+++ b/security/selinux/include/security.h
@@ -9,6 +9,7 @@
9#define _SELINUX_SECURITY_H_ 9#define _SELINUX_SECURITY_H_
10 10
11#include <linux/magic.h> 11#include <linux/magic.h>
12#include <linux/types.h>
12#include "flask.h" 13#include "flask.h"
13 14
14#define SECSID_NULL 0x00000000 /* unspecified SID */ 15#define SECSID_NULL 0x00000000 /* unspecified SID */
@@ -82,6 +83,8 @@ extern int selinux_policycap_openperm;
82int security_mls_enabled(void); 83int security_mls_enabled(void);
83 84
84int security_load_policy(void *data, size_t len); 85int security_load_policy(void *data, size_t len);
86int security_read_policy(void **data, ssize_t *len);
87size_t security_policydb_len(void);
85 88
86int security_policycap_supported(unsigned int req_cap); 89int security_policycap_supported(unsigned int req_cap);
87 90
@@ -191,5 +194,25 @@ static inline int security_netlbl_sid_to_secattr(u32 sid,
191 194
192const char *security_get_initial_sid_context(u32 sid); 195const char *security_get_initial_sid_context(u32 sid);
193 196
197/*
198 * status notifier using mmap interface
199 */
200extern struct page *selinux_kernel_status_page(void);
201
202#define SELINUX_KERNEL_STATUS_VERSION 1
203struct selinux_kernel_status {
 204 u32 version; /* version number of this structure */
205 u32 sequence; /* sequence number of seqlock logic */
206 u32 enforcing; /* current setting of enforcing mode */
207 u32 policyload; /* times of policy reloaded */
208 u32 deny_unknown; /* current setting of deny_unknown */
209 /*
 210 * Members above are valid in version >= 1 of this structure.
211 */
212} __attribute__((packed));
213
214extern void selinux_status_update_setenforce(int enforcing);
215extern void selinux_status_update_policyload(int seqno);
216
194#endif /* _SELINUX_SECURITY_H_ */ 217#endif /* _SELINUX_SECURITY_H_ */
195 218
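
The status page declared above is meant to be mapped read-only and polled with a seqlock-style retry, so userspace can check the enforcing mode without a syscall per lookup. A userspace sketch, assuming selinuxfs is mounted at /selinux and ignoring memory-barrier details:

	#include <fcntl.h>
	#include <stdint.h>
	#include <sys/mman.h>
	#include <unistd.h>

	struct selinux_kernel_status {		/* mirrors the kernel header above */
		uint32_t version, sequence, enforcing, policyload, deny_unknown;
	} __attribute__((packed));

	static int read_enforcing(void)
	{
		const volatile struct selinux_kernel_status *s;
		long page = sysconf(_SC_PAGESIZE);
		uint32_t seq;
		int enforcing, fd;
		void *map;

		fd = open("/selinux/status", O_RDONLY);
		if (fd < 0)
			return -1;
		map = mmap(NULL, page, PROT_READ, MAP_SHARED, fd, 0);
		close(fd);
		if (map == MAP_FAILED)
			return -1;
		s = map;
		do {
			seq = s->sequence;	/* odd: update in progress */
			enforcing = s->enforcing;
		} while ((seq & 1) || seq != s->sequence);
		munmap(map, page);
		return enforcing;
	}
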
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 79a1bb635662..87e0556bae70 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -68,6 +68,8 @@ static int *bool_pending_values;
68static struct dentry *class_dir; 68static struct dentry *class_dir;
69static unsigned long last_class_ino; 69static unsigned long last_class_ino;
70 70
71static char policy_opened;
72
71/* global data for policy capabilities */ 73/* global data for policy capabilities */
72static struct dentry *policycap_dir; 74static struct dentry *policycap_dir;
73 75
@@ -110,6 +112,8 @@ enum sel_inos {
110 SEL_COMPAT_NET, /* whether to use old compat network packet controls */ 112 SEL_COMPAT_NET, /* whether to use old compat network packet controls */
111 SEL_REJECT_UNKNOWN, /* export unknown reject handling to userspace */ 113 SEL_REJECT_UNKNOWN, /* export unknown reject handling to userspace */
112 SEL_DENY_UNKNOWN, /* export unknown deny handling to userspace */ 114 SEL_DENY_UNKNOWN, /* export unknown deny handling to userspace */
115 SEL_STATUS, /* export current status using mmap() */
116 SEL_POLICY, /* allow userspace to read the in kernel policy */
113 SEL_INO_NEXT, /* The next inode number to use */ 117 SEL_INO_NEXT, /* The next inode number to use */
114}; 118};
115 119
@@ -171,6 +175,7 @@ static ssize_t sel_write_enforce(struct file *file, const char __user *buf,
171 if (selinux_enforcing) 175 if (selinux_enforcing)
172 avc_ss_reset(0); 176 avc_ss_reset(0);
173 selnl_notify_setenforce(selinux_enforcing); 177 selnl_notify_setenforce(selinux_enforcing);
178 selinux_status_update_setenforce(selinux_enforcing);
174 } 179 }
175 length = count; 180 length = count;
176out: 181out:
@@ -205,6 +210,59 @@ static const struct file_operations sel_handle_unknown_ops = {
205 .llseek = generic_file_llseek, 210 .llseek = generic_file_llseek,
206}; 211};
207 212
213static int sel_open_handle_status(struct inode *inode, struct file *filp)
214{
215 struct page *status = selinux_kernel_status_page();
216
217 if (!status)
218 return -ENOMEM;
219
220 filp->private_data = status;
221
222 return 0;
223}
224
225static ssize_t sel_read_handle_status(struct file *filp, char __user *buf,
226 size_t count, loff_t *ppos)
227{
228 struct page *status = filp->private_data;
229
230 BUG_ON(!status);
231
232 return simple_read_from_buffer(buf, count, ppos,
233 page_address(status),
234 sizeof(struct selinux_kernel_status));
235}
236
237static int sel_mmap_handle_status(struct file *filp,
238 struct vm_area_struct *vma)
239{
240 struct page *status = filp->private_data;
241 unsigned long size = vma->vm_end - vma->vm_start;
242
243 BUG_ON(!status);
244
 245 /* only allow mapping the first page */
246 if (vma->vm_pgoff > 0 || size != PAGE_SIZE)
247 return -EIO;
248 /* disallow writable mapping */
249 if (vma->vm_flags & VM_WRITE)
250 return -EPERM;
 251 /* disallow mprotect() from making it writable */
252 vma->vm_flags &= ~VM_MAYWRITE;
253
254 return remap_pfn_range(vma, vma->vm_start,
255 page_to_pfn(status),
256 size, vma->vm_page_prot);
257}
258
259static const struct file_operations sel_handle_status_ops = {
260 .open = sel_open_handle_status,
261 .read = sel_read_handle_status,
262 .mmap = sel_mmap_handle_status,
263 .llseek = generic_file_llseek,
264};
265
208#ifdef CONFIG_SECURITY_SELINUX_DISABLE 266#ifdef CONFIG_SECURITY_SELINUX_DISABLE
209static ssize_t sel_write_disable(struct file *file, const char __user *buf, 267static ssize_t sel_write_disable(struct file *file, const char __user *buf,
210 size_t count, loff_t *ppos) 268 size_t count, loff_t *ppos)
@@ -296,6 +354,141 @@ static const struct file_operations sel_mls_ops = {
296 .llseek = generic_file_llseek, 354 .llseek = generic_file_llseek,
297}; 355};
298 356
357struct policy_load_memory {
358 size_t len;
359 void *data;
360};
361
362static int sel_open_policy(struct inode *inode, struct file *filp)
363{
364 struct policy_load_memory *plm = NULL;
365 int rc;
366
367 BUG_ON(filp->private_data);
368
369 mutex_lock(&sel_mutex);
370
371 rc = task_has_security(current, SECURITY__READ_POLICY);
372 if (rc)
373 goto err;
374
375 rc = -EBUSY;
376 if (policy_opened)
377 goto err;
378
379 rc = -ENOMEM;
380 plm = kzalloc(sizeof(*plm), GFP_KERNEL);
381 if (!plm)
382 goto err;
383
384 if (i_size_read(inode) != security_policydb_len()) {
385 mutex_lock(&inode->i_mutex);
386 i_size_write(inode, security_policydb_len());
387 mutex_unlock(&inode->i_mutex);
388 }
389
390 rc = security_read_policy(&plm->data, &plm->len);
391 if (rc)
392 goto err;
393
394 policy_opened = 1;
395
396 filp->private_data = plm;
397
398 mutex_unlock(&sel_mutex);
399
400 return 0;
401err:
402 mutex_unlock(&sel_mutex);
403
404 if (plm)
405 vfree(plm->data);
406 kfree(plm);
407 return rc;
408}
409
410static int sel_release_policy(struct inode *inode, struct file *filp)
411{
412 struct policy_load_memory *plm = filp->private_data;
413
414 BUG_ON(!plm);
415
416 policy_opened = 0;
417
418 vfree(plm->data);
419 kfree(plm);
420
421 return 0;
422}
423
424static ssize_t sel_read_policy(struct file *filp, char __user *buf,
425 size_t count, loff_t *ppos)
426{
427 struct policy_load_memory *plm = filp->private_data;
428 int ret;
429
430 mutex_lock(&sel_mutex);
431
432 ret = task_has_security(current, SECURITY__READ_POLICY);
433 if (ret)
434 goto out;
435
436 ret = simple_read_from_buffer(buf, count, ppos, plm->data, plm->len);
437out:
438 mutex_unlock(&sel_mutex);
439 return ret;
440}
441
442static int sel_mmap_policy_fault(struct vm_area_struct *vma,
443 struct vm_fault *vmf)
444{
445 struct policy_load_memory *plm = vma->vm_file->private_data;
446 unsigned long offset;
447 struct page *page;
448
449 if (vmf->flags & (FAULT_FLAG_MKWRITE | FAULT_FLAG_WRITE))
450 return VM_FAULT_SIGBUS;
451
452 offset = vmf->pgoff << PAGE_SHIFT;
453 if (offset >= roundup(plm->len, PAGE_SIZE))
454 return VM_FAULT_SIGBUS;
455
456 page = vmalloc_to_page(plm->data + offset);
457 get_page(page);
458
459 vmf->page = page;
460
461 return 0;
462}
463
464static struct vm_operations_struct sel_mmap_policy_ops = {
465 .fault = sel_mmap_policy_fault,
466 .page_mkwrite = sel_mmap_policy_fault,
467};
468
469int sel_mmap_policy(struct file *filp, struct vm_area_struct *vma)
470{
471 if (vma->vm_flags & VM_SHARED) {
472 /* do not allow mprotect to make mapping writable */
473 vma->vm_flags &= ~VM_MAYWRITE;
474
475 if (vma->vm_flags & VM_WRITE)
476 return -EACCES;
477 }
478
479 vma->vm_flags |= VM_RESERVED;
480 vma->vm_ops = &sel_mmap_policy_ops;
481
482 return 0;
483}
484
485static const struct file_operations sel_policy_ops = {
486 .open = sel_open_policy,
487 .read = sel_read_policy,
488 .mmap = sel_mmap_policy,
489 .release = sel_release_policy,
490};
491
299static ssize_t sel_write_load(struct file *file, const char __user *buf, 492static ssize_t sel_write_load(struct file *file, const char __user *buf,
300 size_t count, loff_t *ppos) 493 size_t count, loff_t *ppos)
301 494
@@ -1612,6 +1805,8 @@ static int sel_fill_super(struct super_block *sb, void *data, int silent)
1612 [SEL_CHECKREQPROT] = {"checkreqprot", &sel_checkreqprot_ops, S_IRUGO|S_IWUSR}, 1805 [SEL_CHECKREQPROT] = {"checkreqprot", &sel_checkreqprot_ops, S_IRUGO|S_IWUSR},
1613 [SEL_REJECT_UNKNOWN] = {"reject_unknown", &sel_handle_unknown_ops, S_IRUGO}, 1806 [SEL_REJECT_UNKNOWN] = {"reject_unknown", &sel_handle_unknown_ops, S_IRUGO},
1614 [SEL_DENY_UNKNOWN] = {"deny_unknown", &sel_handle_unknown_ops, S_IRUGO}, 1807 [SEL_DENY_UNKNOWN] = {"deny_unknown", &sel_handle_unknown_ops, S_IRUGO},
1808 [SEL_STATUS] = {"status", &sel_handle_status_ops, S_IRUGO},
1809 [SEL_POLICY] = {"policy", &sel_policy_ops, S_IRUSR},
1615 /* last one */ {""} 1810 /* last one */ {""}
1616 }; 1811 };
1617 ret = simple_fill_super(sb, SELINUX_MAGIC, selinux_files); 1812 ret = simple_fill_super(sb, SELINUX_MAGIC, selinux_files);
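
The two new entries let userspace pull the running policy back out of the kernel; the simplest consumer just streams /selinux/policy into a file, the same bytes sel_read_policy()/sel_mmap_policy() expose above. A sketch, assuming selinuxfs is mounted at /selinux and the caller is granted the new security { read_policy } permission:

	#include <stdio.h>

	static int dump_policy(const char *dest)
	{
		FILE *in = fopen("/selinux/policy", "r");
		FILE *out = fopen(dest, "w");
		char buf[4096];
		size_t n;
		int rc = -1;

		if (!in || !out)
			goto done;
		while ((n = fread(buf, 1, sizeof(buf), in)) > 0)
			if (fwrite(buf, 1, n, out) != n)
				goto done;
		rc = ferror(in) ? -1 : 0;
	done:
		if (in)
			fclose(in);
		if (out)
			fclose(out);
		return rc;
	}
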
diff --git a/security/selinux/ss/Makefile b/security/selinux/ss/Makefile
deleted file mode 100644
index 15d4e62917de..000000000000
--- a/security/selinux/ss/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
1#
2# Makefile for building the SELinux security server as part of the kernel tree.
3#
4
5EXTRA_CFLAGS += -Isecurity/selinux -Isecurity/selinux/include
6obj-y := ss.o
7
8ss-y := ebitmap.o hashtab.o symtab.o sidtab.o avtab.o policydb.o services.o conditional.o mls.o
9
diff --git a/security/selinux/ss/avtab.c b/security/selinux/ss/avtab.c
index 929480c6c430..a3dd9faa19c0 100644
--- a/security/selinux/ss/avtab.c
+++ b/security/selinux/ss/avtab.c
@@ -266,8 +266,8 @@ int avtab_alloc(struct avtab *h, u32 nrules)
266 if (shift > 2) 266 if (shift > 2)
267 shift = shift - 2; 267 shift = shift - 2;
268 nslot = 1 << shift; 268 nslot = 1 << shift;
269 if (nslot > MAX_AVTAB_SIZE) 269 if (nslot > MAX_AVTAB_HASH_BUCKETS)
270 nslot = MAX_AVTAB_SIZE; 270 nslot = MAX_AVTAB_HASH_BUCKETS;
271 mask = nslot - 1; 271 mask = nslot - 1;
272 272
273 h->htable = kcalloc(nslot, sizeof(*(h->htable)), GFP_KERNEL); 273 h->htable = kcalloc(nslot, sizeof(*(h->htable)), GFP_KERNEL);
@@ -501,6 +501,48 @@ bad:
501 goto out; 501 goto out;
502} 502}
503 503
504int avtab_write_item(struct policydb *p, struct avtab_node *cur, void *fp)
505{
506 __le16 buf16[4];
507 __le32 buf32[1];
508 int rc;
509
510 buf16[0] = cpu_to_le16(cur->key.source_type);
511 buf16[1] = cpu_to_le16(cur->key.target_type);
512 buf16[2] = cpu_to_le16(cur->key.target_class);
513 buf16[3] = cpu_to_le16(cur->key.specified);
514 rc = put_entry(buf16, sizeof(u16), 4, fp);
515 if (rc)
516 return rc;
517 buf32[0] = cpu_to_le32(cur->datum.data);
518 rc = put_entry(buf32, sizeof(u32), 1, fp);
519 if (rc)
520 return rc;
521 return 0;
522}
523
524int avtab_write(struct policydb *p, struct avtab *a, void *fp)
525{
526 unsigned int i;
527 int rc = 0;
528 struct avtab_node *cur;
529 __le32 buf[1];
530
531 buf[0] = cpu_to_le32(a->nel);
532 rc = put_entry(buf, sizeof(u32), 1, fp);
533 if (rc)
534 return rc;
535
536 for (i = 0; i < a->nslot; i++) {
537 for (cur = a->htable[i]; cur; cur = cur->next) {
538 rc = avtab_write_item(p, cur, fp);
539 if (rc)
540 return rc;
541 }
542 }
543
544 return rc;
545}
504void avtab_cache_init(void) 546void avtab_cache_init(void)
505{ 547{
506 avtab_node_cachep = kmem_cache_create("avtab_node", 548 avtab_node_cachep = kmem_cache_create("avtab_node",
diff --git a/security/selinux/ss/avtab.h b/security/selinux/ss/avtab.h
index cd4f734e2749..dff0c75345c1 100644
--- a/security/selinux/ss/avtab.h
+++ b/security/selinux/ss/avtab.h
@@ -71,6 +71,8 @@ int avtab_read_item(struct avtab *a, void *fp, struct policydb *pol,
71 void *p); 71 void *p);
72 72
73int avtab_read(struct avtab *a, void *fp, struct policydb *pol); 73int avtab_read(struct avtab *a, void *fp, struct policydb *pol);
74int avtab_write_item(struct policydb *p, struct avtab_node *cur, void *fp);
75int avtab_write(struct policydb *p, struct avtab *a, void *fp);
74 76
75struct avtab_node *avtab_insert_nonunique(struct avtab *h, struct avtab_key *key, 77struct avtab_node *avtab_insert_nonunique(struct avtab *h, struct avtab_key *key,
76 struct avtab_datum *datum); 78 struct avtab_datum *datum);
@@ -85,7 +87,6 @@ void avtab_cache_destroy(void);
85#define MAX_AVTAB_HASH_BITS 11 87#define MAX_AVTAB_HASH_BITS 11
86#define MAX_AVTAB_HASH_BUCKETS (1 << MAX_AVTAB_HASH_BITS) 88#define MAX_AVTAB_HASH_BUCKETS (1 << MAX_AVTAB_HASH_BITS)
87#define MAX_AVTAB_HASH_MASK (MAX_AVTAB_HASH_BUCKETS-1) 89#define MAX_AVTAB_HASH_MASK (MAX_AVTAB_HASH_BUCKETS-1)
88#define MAX_AVTAB_SIZE MAX_AVTAB_HASH_BUCKETS
89 90
90#endif /* _SS_AVTAB_H_ */ 91#endif /* _SS_AVTAB_H_ */
91 92
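
Each rule avtab_write_item() emits is a fixed 12-byte little-endian record, the same layout avtab_read_item() already parses. As a layout illustration only (this struct does not exist in the tree):

	struct avtab_disk_entry {
		__le16 source_type;
		__le16 target_type;
		__le16 target_class;
		__le16 specified;	/* AVTAB_ALLOWED, AVTAB_TRANSITION, ... */
		__le32 datum;		/* permission bits or default type */
	} __attribute__((packed));
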
diff --git a/security/selinux/ss/conditional.c b/security/selinux/ss/conditional.c
index c91e150c3087..655fe1c6cc69 100644
--- a/security/selinux/ss/conditional.c
+++ b/security/selinux/ss/conditional.c
@@ -490,6 +490,129 @@ err:
490 return rc; 490 return rc;
491} 491}
492 492
493int cond_write_bool(void *vkey, void *datum, void *ptr)
494{
495 char *key = vkey;
496 struct cond_bool_datum *booldatum = datum;
497 struct policy_data *pd = ptr;
498 void *fp = pd->fp;
499 __le32 buf[3];
500 u32 len;
501 int rc;
502
503 len = strlen(key);
504 buf[0] = cpu_to_le32(booldatum->value);
505 buf[1] = cpu_to_le32(booldatum->state);
506 buf[2] = cpu_to_le32(len);
507 rc = put_entry(buf, sizeof(u32), 3, fp);
508 if (rc)
509 return rc;
510 rc = put_entry(key, 1, len, fp);
511 if (rc)
512 return rc;
513 return 0;
514}
515
516/*
 517 * cond_write_av_list doesn't write out the av_list nodes.
 518 * Instead it writes out the key/value pairs from the avtab. This
 519 * is necessary because there is no way to uniquely identify rules
 520 * in the avtab, so it is not possible to associate individual rules
521 * in the avtab with a conditional without saving them as part of
522 * the conditional. This means that the avtab with the conditional
523 * rules will not be saved but will be rebuilt on policy load.
524 */
525static int cond_write_av_list(struct policydb *p,
526 struct cond_av_list *list, struct policy_file *fp)
527{
528 __le32 buf[1];
529 struct cond_av_list *cur_list;
530 u32 len;
531 int rc;
532
533 len = 0;
534 for (cur_list = list; cur_list != NULL; cur_list = cur_list->next)
535 len++;
536
537 buf[0] = cpu_to_le32(len);
538 rc = put_entry(buf, sizeof(u32), 1, fp);
539 if (rc)
540 return rc;
541
542 if (len == 0)
543 return 0;
544
545 for (cur_list = list; cur_list != NULL; cur_list = cur_list->next) {
546 rc = avtab_write_item(p, cur_list->node, fp);
547 if (rc)
548 return rc;
549 }
550
551 return 0;
552}
553
554int cond_write_node(struct policydb *p, struct cond_node *node,
555 struct policy_file *fp)
556{
557 struct cond_expr *cur_expr;
558 __le32 buf[2];
559 int rc;
560 u32 len = 0;
561
562 buf[0] = cpu_to_le32(node->cur_state);
563 rc = put_entry(buf, sizeof(u32), 1, fp);
564 if (rc)
565 return rc;
566
567 for (cur_expr = node->expr; cur_expr != NULL; cur_expr = cur_expr->next)
568 len++;
569
570 buf[0] = cpu_to_le32(len);
571 rc = put_entry(buf, sizeof(u32), 1, fp);
572 if (rc)
573 return rc;
574
575 for (cur_expr = node->expr; cur_expr != NULL; cur_expr = cur_expr->next) {
576 buf[0] = cpu_to_le32(cur_expr->expr_type);
577 buf[1] = cpu_to_le32(cur_expr->bool);
578 rc = put_entry(buf, sizeof(u32), 2, fp);
579 if (rc)
580 return rc;
581 }
582
583 rc = cond_write_av_list(p, node->true_list, fp);
584 if (rc)
585 return rc;
586 rc = cond_write_av_list(p, node->false_list, fp);
587 if (rc)
588 return rc;
589
590 return 0;
591}
592
593int cond_write_list(struct policydb *p, struct cond_node *list, void *fp)
594{
595 struct cond_node *cur;
596 u32 len;
597 __le32 buf[1];
598 int rc;
599
600 len = 0;
601 for (cur = list; cur != NULL; cur = cur->next)
602 len++;
603 buf[0] = cpu_to_le32(len);
604 rc = put_entry(buf, sizeof(u32), 1, fp);
605 if (rc)
606 return rc;
607
608 for (cur = list; cur != NULL; cur = cur->next) {
609 rc = cond_write_node(p, cur, fp);
610 if (rc)
611 return rc;
612 }
613
614 return 0;
615}
493/* Determine whether additional permissions are granted by the conditional 616/* Determine whether additional permissions are granted by the conditional
494 * av table, and if so, add them to the result 617 * av table, and if so, add them to the result
495 */ 618 */
diff --git a/security/selinux/ss/conditional.h b/security/selinux/ss/conditional.h
index 53ddb013ae57..3f209c635295 100644
--- a/security/selinux/ss/conditional.h
+++ b/security/selinux/ss/conditional.h
@@ -69,6 +69,8 @@ int cond_index_bool(void *key, void *datum, void *datap);
69 69
70int cond_read_bool(struct policydb *p, struct hashtab *h, void *fp); 70int cond_read_bool(struct policydb *p, struct hashtab *h, void *fp);
71int cond_read_list(struct policydb *p, void *fp); 71int cond_read_list(struct policydb *p, void *fp);
72int cond_write_bool(void *key, void *datum, void *ptr);
73int cond_write_list(struct policydb *p, struct cond_node *list, void *fp);
72 74
73void cond_compute_av(struct avtab *ctab, struct avtab_key *key, struct av_decision *avd); 75void cond_compute_av(struct avtab *ctab, struct avtab_key *key, struct av_decision *avd);
74 76
diff --git a/security/selinux/ss/ebitmap.c b/security/selinux/ss/ebitmap.c
index 04b6145d767f..d42951fcbe87 100644
--- a/security/selinux/ss/ebitmap.c
+++ b/security/selinux/ss/ebitmap.c
@@ -22,6 +22,8 @@
22#include "ebitmap.h" 22#include "ebitmap.h"
23#include "policydb.h" 23#include "policydb.h"
24 24
25#define BITS_PER_U64 (sizeof(u64) * 8)
26
25int ebitmap_cmp(struct ebitmap *e1, struct ebitmap *e2) 27int ebitmap_cmp(struct ebitmap *e1, struct ebitmap *e2)
26{ 28{
27 struct ebitmap_node *n1, *n2; 29 struct ebitmap_node *n1, *n2;
@@ -363,10 +365,10 @@ int ebitmap_read(struct ebitmap *e, void *fp)
363 e->highbit = le32_to_cpu(buf[1]); 365 e->highbit = le32_to_cpu(buf[1]);
364 count = le32_to_cpu(buf[2]); 366 count = le32_to_cpu(buf[2]);
365 367
366 if (mapunit != sizeof(u64) * 8) { 368 if (mapunit != BITS_PER_U64) {
367 printk(KERN_ERR "SELinux: ebitmap: map size %u does not " 369 printk(KERN_ERR "SELinux: ebitmap: map size %u does not "
368 "match my size %Zd (high bit was %d)\n", 370 "match my size %Zd (high bit was %d)\n",
369 mapunit, sizeof(u64) * 8, e->highbit); 371 mapunit, BITS_PER_U64, e->highbit);
370 goto bad; 372 goto bad;
371 } 373 }
372 374
@@ -446,3 +448,78 @@ bad:
446 ebitmap_destroy(e); 448 ebitmap_destroy(e);
447 goto out; 449 goto out;
448} 450}
451
452int ebitmap_write(struct ebitmap *e, void *fp)
453{
454 struct ebitmap_node *n;
455 u32 count;
456 __le32 buf[3];
457 u64 map;
458 int bit, last_bit, last_startbit, rc;
459
460 buf[0] = cpu_to_le32(BITS_PER_U64);
461
462 count = 0;
463 last_bit = 0;
464 last_startbit = -1;
465 ebitmap_for_each_positive_bit(e, n, bit) {
466 if (rounddown(bit, (int)BITS_PER_U64) > last_startbit) {
467 count++;
468 last_startbit = rounddown(bit, BITS_PER_U64);
469 }
470 last_bit = roundup(bit + 1, BITS_PER_U64);
471 }
472 buf[1] = cpu_to_le32(last_bit);
473 buf[2] = cpu_to_le32(count);
474
475 rc = put_entry(buf, sizeof(u32), 3, fp);
476 if (rc)
477 return rc;
478
479 map = 0;
480 last_startbit = INT_MIN;
481 ebitmap_for_each_positive_bit(e, n, bit) {
482 if (rounddown(bit, (int)BITS_PER_U64) > last_startbit) {
483 __le64 buf64[1];
484
485 /* this is the very first bit */
486 if (!map) {
487 last_startbit = rounddown(bit, BITS_PER_U64);
488 map = (u64)1 << (bit - last_startbit);
489 continue;
490 }
491
492 /* write the last node */
493 buf[0] = cpu_to_le32(last_startbit);
494 rc = put_entry(buf, sizeof(u32), 1, fp);
495 if (rc)
496 return rc;
497
498 buf64[0] = cpu_to_le64(map);
499 rc = put_entry(buf64, sizeof(u64), 1, fp);
500 if (rc)
501 return rc;
502
503 /* set up for the next node */
504 map = 0;
505 last_startbit = rounddown(bit, BITS_PER_U64);
506 }
507 map |= (u64)1 << (bit - last_startbit);
508 }
509 /* write the last node */
510 if (map) {
511 __le64 buf64[1];
512
513 /* write the last node */
514 buf[0] = cpu_to_le32(last_startbit);
515 rc = put_entry(buf, sizeof(u32), 1, fp);
516 if (rc)
517 return rc;
518
519 buf64[0] = cpu_to_le64(map);
520 rc = put_entry(buf64, sizeof(u64), 1, fp);
521 if (rc)
522 return rc;
523 }
524 return 0;
525}
diff --git a/security/selinux/ss/ebitmap.h b/security/selinux/ss/ebitmap.h
index f283b4367f54..1f4e93c2ae86 100644
--- a/security/selinux/ss/ebitmap.h
+++ b/security/selinux/ss/ebitmap.h
@@ -123,6 +123,7 @@ int ebitmap_get_bit(struct ebitmap *e, unsigned long bit);
123int ebitmap_set_bit(struct ebitmap *e, unsigned long bit, int value); 123int ebitmap_set_bit(struct ebitmap *e, unsigned long bit, int value);
124void ebitmap_destroy(struct ebitmap *e); 124void ebitmap_destroy(struct ebitmap *e);
125int ebitmap_read(struct ebitmap *e, void *fp); 125int ebitmap_read(struct ebitmap *e, void *fp);
126int ebitmap_write(struct ebitmap *e, void *fp);
126 127
127#ifdef CONFIG_NETLABEL 128#ifdef CONFIG_NETLABEL
128int ebitmap_netlbl_export(struct ebitmap *ebmap, 129int ebitmap_netlbl_export(struct ebitmap *ebmap,
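
To make the node format concrete, a small worked example of what ebitmap_write() above emits (derived by hand from the code, not taken from a real policy): a map with bits 3 and 70 set becomes

	header:  mapunit = 64, highbit = 128, count = 2
	node 1:  startbit = 0  (u32), map = 0x0000000000000008 (u64, bit 3)
	node 2:  startbit = 64 (u32), map = 0x0000000000000040 (u64, bit 70 - 64 = 6)
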
diff --git a/security/selinux/ss/policydb.c b/security/selinux/ss/policydb.c
index 3a29704be8ce..94f630d93a5c 100644
--- a/security/selinux/ss/policydb.c
+++ b/security/selinux/ss/policydb.c
@@ -37,6 +37,7 @@
37#include "policydb.h" 37#include "policydb.h"
38#include "conditional.h" 38#include "conditional.h"
39#include "mls.h" 39#include "mls.h"
40#include "services.h"
40 41
41#define _DEBUG_HASHES 42#define _DEBUG_HASHES
42 43
@@ -185,9 +186,19 @@ static u32 rangetr_hash(struct hashtab *h, const void *k)
185static int rangetr_cmp(struct hashtab *h, const void *k1, const void *k2) 186static int rangetr_cmp(struct hashtab *h, const void *k1, const void *k2)
186{ 187{
187 const struct range_trans *key1 = k1, *key2 = k2; 188 const struct range_trans *key1 = k1, *key2 = k2;
188 return (key1->source_type != key2->source_type || 189 int v;
189 key1->target_type != key2->target_type || 190
190 key1->target_class != key2->target_class); 191 v = key1->source_type - key2->source_type;
192 if (v)
193 return v;
194
195 v = key1->target_type - key2->target_type;
196 if (v)
197 return v;
198
199 v = key1->target_class - key2->target_class;
200
201 return v;
191} 202}
192 203
193/* 204/*
@@ -1624,11 +1635,11 @@ static int role_bounds_sanity_check(void *key, void *datum, void *datap)
1624 1635
1625static int type_bounds_sanity_check(void *key, void *datum, void *datap) 1636static int type_bounds_sanity_check(void *key, void *datum, void *datap)
1626{ 1637{
1627 struct type_datum *upper, *type; 1638 struct type_datum *upper;
1628 struct policydb *p = datap; 1639 struct policydb *p = datap;
1629 int depth = 0; 1640 int depth = 0;
1630 1641
1631 upper = type = datum; 1642 upper = datum;
1632 while (upper->bounds) { 1643 while (upper->bounds) {
1633 if (++depth == POLICYDB_BOUNDS_MAXDEPTH) { 1644 if (++depth == POLICYDB_BOUNDS_MAXDEPTH) {
1634 printk(KERN_ERR "SELinux: type %s: " 1645 printk(KERN_ERR "SELinux: type %s: "
@@ -2306,3 +2317,843 @@ bad:
2306 policydb_destroy(p); 2317 policydb_destroy(p);
2307 goto out; 2318 goto out;
2308} 2319}
2320
2321/*
 2322 * Write an MLS level structure to a policydb binary
2323 * representation file.
2324 */
2325static int mls_write_level(struct mls_level *l, void *fp)
2326{
2327 __le32 buf[1];
2328 int rc;
2329
2330 buf[0] = cpu_to_le32(l->sens);
2331 rc = put_entry(buf, sizeof(u32), 1, fp);
2332 if (rc)
2333 return rc;
2334
2335 rc = ebitmap_write(&l->cat, fp);
2336 if (rc)
2337 return rc;
2338
2339 return 0;
2340}
2341
2342/*
 2343 * Write an MLS range structure to a policydb binary
2344 * representation file.
2345 */
2346static int mls_write_range_helper(struct mls_range *r, void *fp)
2347{
2348 __le32 buf[3];
2349 size_t items;
2350 int rc, eq;
2351
2352 eq = mls_level_eq(&r->level[1], &r->level[0]);
2353
2354 if (eq)
2355 items = 2;
2356 else
2357 items = 3;
2358 buf[0] = cpu_to_le32(items-1);
2359 buf[1] = cpu_to_le32(r->level[0].sens);
2360 if (!eq)
2361 buf[2] = cpu_to_le32(r->level[1].sens);
2362
2363 BUG_ON(items > (sizeof(buf)/sizeof(buf[0])));
2364
2365 rc = put_entry(buf, sizeof(u32), items, fp);
2366 if (rc)
2367 return rc;
2368
2369 rc = ebitmap_write(&r->level[0].cat, fp);
2370 if (rc)
2371 return rc;
2372 if (!eq) {
2373 rc = ebitmap_write(&r->level[1].cat, fp);
2374 if (rc)
2375 return rc;
2376 }
2377
2378 return 0;
2379}
2380
2381static int sens_write(void *vkey, void *datum, void *ptr)
2382{
2383 char *key = vkey;
2384 struct level_datum *levdatum = datum;
2385 struct policy_data *pd = ptr;
2386 void *fp = pd->fp;
2387 __le32 buf[2];
2388 size_t len;
2389 int rc;
2390
2391 len = strlen(key);
2392 buf[0] = cpu_to_le32(len);
2393 buf[1] = cpu_to_le32(levdatum->isalias);
2394 rc = put_entry(buf, sizeof(u32), 2, fp);
2395 if (rc)
2396 return rc;
2397
2398 rc = put_entry(key, 1, len, fp);
2399 if (rc)
2400 return rc;
2401
2402 rc = mls_write_level(levdatum->level, fp);
2403 if (rc)
2404 return rc;
2405
2406 return 0;
2407}
2408
2409static int cat_write(void *vkey, void *datum, void *ptr)
2410{
2411 char *key = vkey;
2412 struct cat_datum *catdatum = datum;
2413 struct policy_data *pd = ptr;
2414 void *fp = pd->fp;
2415 __le32 buf[3];
2416 size_t len;
2417 int rc;
2418
2419 len = strlen(key);
2420 buf[0] = cpu_to_le32(len);
2421 buf[1] = cpu_to_le32(catdatum->value);
2422 buf[2] = cpu_to_le32(catdatum->isalias);
2423 rc = put_entry(buf, sizeof(u32), 3, fp);
2424 if (rc)
2425 return rc;
2426
2427 rc = put_entry(key, 1, len, fp);
2428 if (rc)
2429 return rc;
2430
2431 return 0;
2432}
2433
2434static int role_trans_write(struct role_trans *r, void *fp)
2435{
2436 struct role_trans *tr;
2437 u32 buf[3];
2438 size_t nel;
2439 int rc;
2440
2441 nel = 0;
2442 for (tr = r; tr; tr = tr->next)
2443 nel++;
2444 buf[0] = cpu_to_le32(nel);
2445 rc = put_entry(buf, sizeof(u32), 1, fp);
2446 if (rc)
2447 return rc;
2448 for (tr = r; tr; tr = tr->next) {
2449 buf[0] = cpu_to_le32(tr->role);
2450 buf[1] = cpu_to_le32(tr->type);
2451 buf[2] = cpu_to_le32(tr->new_role);
2452 rc = put_entry(buf, sizeof(u32), 3, fp);
2453 if (rc)
2454 return rc;
2455 }
2456
2457 return 0;
2458}
2459
2460static int role_allow_write(struct role_allow *r, void *fp)
2461{
2462 struct role_allow *ra;
2463 u32 buf[2];
2464 size_t nel;
2465 int rc;
2466
2467 nel = 0;
2468 for (ra = r; ra; ra = ra->next)
2469 nel++;
2470 buf[0] = cpu_to_le32(nel);
2471 rc = put_entry(buf, sizeof(u32), 1, fp);
2472 if (rc)
2473 return rc;
2474 for (ra = r; ra; ra = ra->next) {
2475 buf[0] = cpu_to_le32(ra->role);
2476 buf[1] = cpu_to_le32(ra->new_role);
2477 rc = put_entry(buf, sizeof(u32), 2, fp);
2478 if (rc)
2479 return rc;
2480 }
2481 return 0;
2482}
2483
2484/*
2485 * Write a security context structure
2486 * to a policydb binary representation file.
2487 */
2488static int context_write(struct policydb *p, struct context *c,
2489 void *fp)
2490{
2491 int rc;
2492 __le32 buf[3];
2493
2494 buf[0] = cpu_to_le32(c->user);
2495 buf[1] = cpu_to_le32(c->role);
2496 buf[2] = cpu_to_le32(c->type);
2497
2498 rc = put_entry(buf, sizeof(u32), 3, fp);
2499 if (rc)
2500 return rc;
2501
2502 rc = mls_write_range_helper(&c->range, fp);
2503 if (rc)
2504 return rc;
2505
2506 return 0;
2507}
2508
2509/*
2510 * The following *_write functions are used to
2511 * write the symbol data to a policy database
2512 * binary representation file.
2513 */
2514
2515static int perm_write(void *vkey, void *datum, void *fp)
2516{
2517 char *key = vkey;
2518 struct perm_datum *perdatum = datum;
2519 __le32 buf[2];
2520 size_t len;
2521 int rc;
2522
2523 len = strlen(key);
2524 buf[0] = cpu_to_le32(len);
2525 buf[1] = cpu_to_le32(perdatum->value);
2526 rc = put_entry(buf, sizeof(u32), 2, fp);
2527 if (rc)
2528 return rc;
2529
2530 rc = put_entry(key, 1, len, fp);
2531 if (rc)
2532 return rc;
2533
2534 return 0;
2535}
2536
2537static int common_write(void *vkey, void *datum, void *ptr)
2538{
2539 char *key = vkey;
2540 struct common_datum *comdatum = datum;
2541 struct policy_data *pd = ptr;
2542 void *fp = pd->fp;
2543 __le32 buf[4];
2544 size_t len;
2545 int rc;
2546
2547 len = strlen(key);
2548 buf[0] = cpu_to_le32(len);
2549 buf[1] = cpu_to_le32(comdatum->value);
2550 buf[2] = cpu_to_le32(comdatum->permissions.nprim);
2551 buf[3] = cpu_to_le32(comdatum->permissions.table->nel);
2552 rc = put_entry(buf, sizeof(u32), 4, fp);
2553 if (rc)
2554 return rc;
2555
2556 rc = put_entry(key, 1, len, fp);
2557 if (rc)
2558 return rc;
2559
2560 rc = hashtab_map(comdatum->permissions.table, perm_write, fp);
2561 if (rc)
2562 return rc;
2563
2564 return 0;
2565}
2566
2567static int write_cons_helper(struct policydb *p, struct constraint_node *node,
2568 void *fp)
2569{
2570 struct constraint_node *c;
2571 struct constraint_expr *e;
2572 __le32 buf[3];
2573 u32 nel;
2574 int rc;
2575
2576 for (c = node; c; c = c->next) {
2577 nel = 0;
2578 for (e = c->expr; e; e = e->next)
2579 nel++;
2580 buf[0] = cpu_to_le32(c->permissions);
2581 buf[1] = cpu_to_le32(nel);
2582 rc = put_entry(buf, sizeof(u32), 2, fp);
2583 if (rc)
2584 return rc;
2585 for (e = c->expr; e; e = e->next) {
2586 buf[0] = cpu_to_le32(e->expr_type);
2587 buf[1] = cpu_to_le32(e->attr);
2588 buf[2] = cpu_to_le32(e->op);
2589 rc = put_entry(buf, sizeof(u32), 3, fp);
2590 if (rc)
2591 return rc;
2592
2593 switch (e->expr_type) {
2594 case CEXPR_NAMES:
2595 rc = ebitmap_write(&e->names, fp);
2596 if (rc)
2597 return rc;
2598 break;
2599 default:
2600 break;
2601 }
2602 }
2603 }
2604
2605 return 0;
2606}
2607
2608static int class_write(void *vkey, void *datum, void *ptr)
2609{
2610 char *key = vkey;
2611 struct class_datum *cladatum = datum;
2612 struct policy_data *pd = ptr;
2613 void *fp = pd->fp;
2614 struct policydb *p = pd->p;
2615 struct constraint_node *c;
2616 __le32 buf[6];
2617 u32 ncons;
2618 size_t len, len2;
2619 int rc;
2620
2621 len = strlen(key);
2622 if (cladatum->comkey)
2623 len2 = strlen(cladatum->comkey);
2624 else
2625 len2 = 0;
2626
2627 ncons = 0;
2628 for (c = cladatum->constraints; c; c = c->next)
2629 ncons++;
2630
2631 buf[0] = cpu_to_le32(len);
2632 buf[1] = cpu_to_le32(len2);
2633 buf[2] = cpu_to_le32(cladatum->value);
2634 buf[3] = cpu_to_le32(cladatum->permissions.nprim);
2635 if (cladatum->permissions.table)
2636 buf[4] = cpu_to_le32(cladatum->permissions.table->nel);
2637 else
2638 buf[4] = 0;
2639 buf[5] = cpu_to_le32(ncons);
2640 rc = put_entry(buf, sizeof(u32), 6, fp);
2641 if (rc)
2642 return rc;
2643
2644 rc = put_entry(key, 1, len, fp);
2645 if (rc)
2646 return rc;
2647
2648 if (cladatum->comkey) {
2649 rc = put_entry(cladatum->comkey, 1, len2, fp);
2650 if (rc)
2651 return rc;
2652 }
2653
2654 rc = hashtab_map(cladatum->permissions.table, perm_write, fp);
2655 if (rc)
2656 return rc;
2657
2658 rc = write_cons_helper(p, cladatum->constraints, fp);
2659 if (rc)
2660 return rc;
2661
2662 /* write out the validatetrans rule */
2663 ncons = 0;
2664 for (c = cladatum->validatetrans; c; c = c->next)
2665 ncons++;
2666
2667 buf[0] = cpu_to_le32(ncons);
2668 rc = put_entry(buf, sizeof(u32), 1, fp);
2669 if (rc)
2670 return rc;
2671
2672 rc = write_cons_helper(p, cladatum->validatetrans, fp);
2673 if (rc)
2674 return rc;
2675
2676 return 0;
2677}
2678
2679static int role_write(void *vkey, void *datum, void *ptr)
2680{
2681 char *key = vkey;
2682 struct role_datum *role = datum;
2683 struct policy_data *pd = ptr;
2684 void *fp = pd->fp;
2685 struct policydb *p = pd->p;
2686 __le32 buf[3];
2687 size_t items, len;
2688 int rc;
2689
2690 len = strlen(key);
2691 items = 0;
2692 buf[items++] = cpu_to_le32(len);
2693 buf[items++] = cpu_to_le32(role->value);
2694 if (p->policyvers >= POLICYDB_VERSION_BOUNDARY)
2695 buf[items++] = cpu_to_le32(role->bounds);
2696
2697 BUG_ON(items > (sizeof(buf)/sizeof(buf[0])));
2698
2699 rc = put_entry(buf, sizeof(u32), items, fp);
2700 if (rc)
2701 return rc;
2702
2703 rc = put_entry(key, 1, len, fp);
2704 if (rc)
2705 return rc;
2706
2707 rc = ebitmap_write(&role->dominates, fp);
2708 if (rc)
2709 return rc;
2710
2711 rc = ebitmap_write(&role->types, fp);
2712 if (rc)
2713 return rc;
2714
2715 return 0;
2716}
2717
2718static int type_write(void *vkey, void *datum, void *ptr)
2719{
2720 char *key = vkey;
2721 struct type_datum *typdatum = datum;
2722 struct policy_data *pd = ptr;
2723 struct policydb *p = pd->p;
2724 void *fp = pd->fp;
2725 __le32 buf[4];
2726 int rc;
2727 size_t items, len;
2728
2729 len = strlen(key);
2730 items = 0;
2731 buf[items++] = cpu_to_le32(len);
2732 buf[items++] = cpu_to_le32(typdatum->value);
2733 if (p->policyvers >= POLICYDB_VERSION_BOUNDARY) {
2734 u32 properties = 0;
2735
2736 if (typdatum->primary)
2737 properties |= TYPEDATUM_PROPERTY_PRIMARY;
2738
2739 if (typdatum->attribute)
2740 properties |= TYPEDATUM_PROPERTY_ATTRIBUTE;
2741
2742 buf[items++] = cpu_to_le32(properties);
2743 buf[items++] = cpu_to_le32(typdatum->bounds);
2744 } else {
2745 buf[items++] = cpu_to_le32(typdatum->primary);
2746 }
2747 BUG_ON(items > (sizeof(buf) / sizeof(buf[0])));
2748 rc = put_entry(buf, sizeof(u32), items, fp);
2749 if (rc)
2750 return rc;
2751
2752 rc = put_entry(key, 1, len, fp);
2753 if (rc)
2754 return rc;
2755
2756 return 0;
2757}
2758
2759static int user_write(void *vkey, void *datum, void *ptr)
2760{
2761 char *key = vkey;
2762 struct user_datum *usrdatum = datum;
2763 struct policy_data *pd = ptr;
2764 struct policydb *p = pd->p;
2765 void *fp = pd->fp;
2766 __le32 buf[3];
2767 size_t items, len;
2768 int rc;
2769
2770 len = strlen(key);
2771 items = 0;
2772 buf[items++] = cpu_to_le32(len);
2773 buf[items++] = cpu_to_le32(usrdatum->value);
2774 if (p->policyvers >= POLICYDB_VERSION_BOUNDARY)
2775 buf[items++] = cpu_to_le32(usrdatum->bounds);
2776 BUG_ON(items > (sizeof(buf) / sizeof(buf[0])));
2777 rc = put_entry(buf, sizeof(u32), items, fp);
2778 if (rc)
2779 return rc;
2780
2781 rc = put_entry(key, 1, len, fp);
2782 if (rc)
2783 return rc;
2784
2785 rc = ebitmap_write(&usrdatum->roles, fp);
2786 if (rc)
2787 return rc;
2788
2789 rc = mls_write_range_helper(&usrdatum->range, fp);
2790 if (rc)
2791 return rc;
2792
2793 rc = mls_write_level(&usrdatum->dfltlevel, fp);
2794 if (rc)
2795 return rc;
2796
2797 return 0;
2798}
2799
2800static int (*write_f[SYM_NUM]) (void *key, void *datum,
2801 void *datap) =
2802{
2803 common_write,
2804 class_write,
2805 role_write,
2806 type_write,
2807 user_write,
2808 cond_write_bool,
2809 sens_write,
2810 cat_write,
2811};
2812
2813static int ocontext_write(struct policydb *p, struct policydb_compat_info *info,
2814 void *fp)
2815{
2816 unsigned int i, j, rc;
2817 size_t nel, len;
2818 __le32 buf[3];
2819 u32 nodebuf[8];
2820 struct ocontext *c;
2821 for (i = 0; i < info->ocon_num; i++) {
2822 nel = 0;
2823 for (c = p->ocontexts[i]; c; c = c->next)
2824 nel++;
2825 buf[0] = cpu_to_le32(nel);
2826 rc = put_entry(buf, sizeof(u32), 1, fp);
2827 if (rc)
2828 return rc;
2829 for (c = p->ocontexts[i]; c; c = c->next) {
2830 switch (i) {
2831 case OCON_ISID:
2832 buf[0] = cpu_to_le32(c->sid[0]);
2833 rc = put_entry(buf, sizeof(u32), 1, fp);
2834 if (rc)
2835 return rc;
2836 rc = context_write(p, &c->context[0], fp);
2837 if (rc)
2838 return rc;
2839 break;
2840 case OCON_FS:
2841 case OCON_NETIF:
2842 len = strlen(c->u.name);
2843 buf[0] = cpu_to_le32(len);
2844 rc = put_entry(buf, sizeof(u32), 1, fp);
2845 if (rc)
2846 return rc;
2847 rc = put_entry(c->u.name, 1, len, fp);
2848 if (rc)
2849 return rc;
2850 rc = context_write(p, &c->context[0], fp);
2851 if (rc)
2852 return rc;
2853 rc = context_write(p, &c->context[1], fp);
2854 if (rc)
2855 return rc;
2856 break;
2857 case OCON_PORT:
2858 buf[0] = cpu_to_le32(c->u.port.protocol);
2859 buf[1] = cpu_to_le32(c->u.port.low_port);
2860 buf[2] = cpu_to_le32(c->u.port.high_port);
2861 rc = put_entry(buf, sizeof(u32), 3, fp);
2862 if (rc)
2863 return rc;
2864 rc = context_write(p, &c->context[0], fp);
2865 if (rc)
2866 return rc;
2867 break;
2868 case OCON_NODE:
2869 nodebuf[0] = c->u.node.addr; /* network order */
2870 nodebuf[1] = c->u.node.mask; /* network order */
2871 rc = put_entry(nodebuf, sizeof(u32), 2, fp);
2872 if (rc)
2873 return rc;
2874 rc = context_write(p, &c->context[0], fp);
2875 if (rc)
2876 return rc;
2877 break;
2878 case OCON_FSUSE:
2879 buf[0] = cpu_to_le32(c->v.behavior);
2880 len = strlen(c->u.name);
2881 buf[1] = cpu_to_le32(len);
2882 rc = put_entry(buf, sizeof(u32), 2, fp);
2883 if (rc)
2884 return rc;
2885 rc = put_entry(c->u.name, 1, len, fp);
2886 if (rc)
2887 return rc;
2888 rc = context_write(p, &c->context[0], fp);
2889 if (rc)
2890 return rc;
2891 break;
2892 case OCON_NODE6:
2893 for (j = 0; j < 4; j++)
2894 nodebuf[j] = c->u.node6.addr[j]; /* network order */
2895 for (j = 0; j < 4; j++)
2896 nodebuf[j + 4] = c->u.node6.mask[j]; /* network order */
2897 rc = put_entry(nodebuf, sizeof(u32), 8, fp);
2898 if (rc)
2899 return rc;
2900 rc = context_write(p, &c->context[0], fp);
2901 if (rc)
2902 return rc;
2903 break;
2904 }
2905 }
2906 }
2907 return 0;
2908}
2909
2910static int genfs_write(struct policydb *p, void *fp)
2911{
2912 struct genfs *genfs;
2913 struct ocontext *c;
2914 size_t len;
2915 __le32 buf[1];
2916 int rc;
2917
2918 len = 0;
2919 for (genfs = p->genfs; genfs; genfs = genfs->next)
2920 len++;
2921 buf[0] = cpu_to_le32(len);
2922 rc = put_entry(buf, sizeof(u32), 1, fp);
2923 if (rc)
2924 return rc;
2925 for (genfs = p->genfs; genfs; genfs = genfs->next) {
2926 len = strlen(genfs->fstype);
2927 buf[0] = cpu_to_le32(len);
2928 rc = put_entry(buf, sizeof(u32), 1, fp);
2929 if (rc)
2930 return rc;
2931 rc = put_entry(genfs->fstype, 1, len, fp);
2932 if (rc)
2933 return rc;
2934 len = 0;
2935 for (c = genfs->head; c; c = c->next)
2936 len++;
2937 buf[0] = cpu_to_le32(len);
2938 rc = put_entry(buf, sizeof(u32), 1, fp);
2939 if (rc)
2940 return rc;
2941 for (c = genfs->head; c; c = c->next) {
2942 len = strlen(c->u.name);
2943 buf[0] = cpu_to_le32(len);
2944 rc = put_entry(buf, sizeof(u32), 1, fp);
2945 if (rc)
2946 return rc;
2947 rc = put_entry(c->u.name, 1, len, fp);
2948 if (rc)
2949 return rc;
2950 buf[0] = cpu_to_le32(c->v.sclass);
2951 rc = put_entry(buf, sizeof(u32), 1, fp);
2952 if (rc)
2953 return rc;
2954 rc = context_write(p, &c->context[0], fp);
2955 if (rc)
2956 return rc;
2957 }
2958 }
2959 return 0;
2960}
2961
2962static int range_count(void *key, void *data, void *ptr)
2963{
2964 int *cnt = ptr;
2965 *cnt = *cnt + 1;
2966
2967 return 0;
2968}
2969
2970static int range_write_helper(void *key, void *data, void *ptr)
2971{
2972 __le32 buf[2];
2973 struct range_trans *rt = key;
2974 struct mls_range *r = data;
2975 struct policy_data *pd = ptr;
2976 void *fp = pd->fp;
2977 struct policydb *p = pd->p;
2978 int rc;
2979
2980 buf[0] = cpu_to_le32(rt->source_type);
2981 buf[1] = cpu_to_le32(rt->target_type);
2982 rc = put_entry(buf, sizeof(u32), 2, fp);
2983 if (rc)
2984 return rc;
2985 if (p->policyvers >= POLICYDB_VERSION_RANGETRANS) {
2986 buf[0] = cpu_to_le32(rt->target_class);
2987 rc = put_entry(buf, sizeof(u32), 1, fp);
2988 if (rc)
2989 return rc;
2990 }
2991 rc = mls_write_range_helper(r, fp);
2992 if (rc)
2993 return rc;
2994
2995 return 0;
2996}
2997
2998static int range_write(struct policydb *p, void *fp)
2999{
3000 size_t nel;
3001 __le32 buf[1];
3002 int rc;
3003 struct policy_data pd;
3004
3005 pd.p = p;
3006 pd.fp = fp;
3007
3008 /* count the number of entries in the hashtab */
3009 nel = 0;
3010 rc = hashtab_map(p->range_tr, range_count, &nel);
3011 if (rc)
3012 return rc;
3013
3014 buf[0] = cpu_to_le32(nel);
3015 rc = put_entry(buf, sizeof(u32), 1, fp);
3016 if (rc)
3017 return rc;
3018
3019 /* actually write all of the entries */
3020 rc = hashtab_map(p->range_tr, range_write_helper, &pd);
3021 if (rc)
3022 return rc;
3023
3024 return 0;
3025}
3026
3027/*
3028 * Write the configuration data in a policy database
3029 * structure to a policy database binary representation
3030 * file.
3031 */
3032int policydb_write(struct policydb *p, void *fp)
3033{
3034 unsigned int i, num_syms;
3035 int rc;
3036 __le32 buf[4];
3037 u32 config;
3038 size_t len;
3039 struct policydb_compat_info *info;
3040
3041 /*
 3042 * Refuse to write policy older than the compressed avtab
 3043 * format, to simplify the writer. Other checks have been
 3044 * dropped since this is assumed throughout the writer code;
 3045 * be careful if you ever try to remove this restriction.
3046 */
3047 if (p->policyvers < POLICYDB_VERSION_AVTAB) {
3048		printk(KERN_ERR "SELinux: refusing to write policy version %d;"
3049			" it is older than the minimum supported version %d\n", p->policyvers,
3050 POLICYDB_VERSION_AVTAB);
3051 return -EINVAL;
3052 }
3053
3054 config = 0;
3055 if (p->mls_enabled)
3056 config |= POLICYDB_CONFIG_MLS;
3057
3058 if (p->reject_unknown)
3059 config |= REJECT_UNKNOWN;
3060 if (p->allow_unknown)
3061 config |= ALLOW_UNKNOWN;
3062
3063 /* Write the magic number and string identifiers. */
3064 buf[0] = cpu_to_le32(POLICYDB_MAGIC);
3065 len = strlen(POLICYDB_STRING);
3066 buf[1] = cpu_to_le32(len);
3067 rc = put_entry(buf, sizeof(u32), 2, fp);
3068 if (rc)
3069 return rc;
3070 rc = put_entry(POLICYDB_STRING, 1, len, fp);
3071 if (rc)
3072 return rc;
3073
3074 /* Write the version, config, and table sizes. */
3075 info = policydb_lookup_compat(p->policyvers);
3076 if (!info) {
3077 printk(KERN_ERR "SELinux: compatibility lookup failed for policy "
3078			"version %d\n", p->policyvers);
3079		return -EINVAL;
3080 }
3081
3082 buf[0] = cpu_to_le32(p->policyvers);
3083 buf[1] = cpu_to_le32(config);
3084 buf[2] = cpu_to_le32(info->sym_num);
3085 buf[3] = cpu_to_le32(info->ocon_num);
3086
3087 rc = put_entry(buf, sizeof(u32), 4, fp);
3088 if (rc)
3089 return rc;
3090
3091 if (p->policyvers >= POLICYDB_VERSION_POLCAP) {
3092 rc = ebitmap_write(&p->policycaps, fp);
3093 if (rc)
3094 return rc;
3095 }
3096
3097 if (p->policyvers >= POLICYDB_VERSION_PERMISSIVE) {
3098 rc = ebitmap_write(&p->permissive_map, fp);
3099 if (rc)
3100 return rc;
3101 }
3102
3103 num_syms = info->sym_num;
3104 for (i = 0; i < num_syms; i++) {
3105 struct policy_data pd;
3106
3107 pd.fp = fp;
3108 pd.p = p;
3109
3110 buf[0] = cpu_to_le32(p->symtab[i].nprim);
3111 buf[1] = cpu_to_le32(p->symtab[i].table->nel);
3112
3113 rc = put_entry(buf, sizeof(u32), 2, fp);
3114 if (rc)
3115 return rc;
3116 rc = hashtab_map(p->symtab[i].table, write_f[i], &pd);
3117 if (rc)
3118 return rc;
3119 }
3120
3121 rc = avtab_write(p, &p->te_avtab, fp);
3122 if (rc)
3123 return rc;
3124
3125 rc = cond_write_list(p, p->cond_list, fp);
3126 if (rc)
3127 return rc;
3128
3129 rc = role_trans_write(p->role_tr, fp);
3130 if (rc)
3131 return rc;
3132
3133 rc = role_allow_write(p->role_allow, fp);
3134 if (rc)
3135 return rc;
3136
3137 rc = ocontext_write(p, info, fp);
3138 if (rc)
3139 return rc;
3140
3141 rc = genfs_write(p, fp);
3142 if (rc)
3143 return rc;
3144
3145 rc = range_write(p, fp);
3146 if (rc)
3147 return rc;
3148
3149 for (i = 0; i < p->p_types.nprim; i++) {
3150 struct ebitmap *e = flex_array_get(p->type_attr_map_array, i);
3151
3152 BUG_ON(!e);
3153 rc = ebitmap_write(e, fp);
3154 if (rc)
3155 return rc;
3156 }
3157
3158 return 0;
3159}
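For reference, the image that policydb_write() emits above begins with a fixed header: the magic number and the length of POLICYDB_STRING as two little-endian u32 words, the identifier string itself (no NUL terminator), and then four more u32 words holding the policy version, the config flags (POLICYDB_CONFIG_MLS, REJECT_UNKNOWN, ALLOW_UNKNOWN), the symbol-table count and the ocontext count. The sketch below is a minimal userspace-style parser for just that header, written under those assumptions; the struct, helper and function names are illustrative and not part of the kernel or libsepol API.

/*
 * Sketch only: parse the header that policydb_write() writes at the
 * start of the image.  All integer fields are little-endian u32.
 */
#include <stdint.h>
#include <stddef.h>
#include <string.h>

struct pdb_header {
	uint32_t magic;      /* POLICYDB_MAGIC */
	char     ident[64];  /* POLICYDB_STRING, stored length-prefixed, no NUL */
	uint32_t version;
	uint32_t config;     /* POLICYDB_CONFIG_MLS / REJECT_UNKNOWN / ALLOW_UNKNOWN */
	uint32_t sym_num;    /* number of symbol tables that follow */
	uint32_t ocon_num;   /* number of ocontext sections that follow */
};

static uint32_t get_le32(const uint8_t *p)
{
	return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
	       ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

static int pdb_parse_header(const uint8_t *img, size_t len, struct pdb_header *h)
{
	size_t off = 0, slen;

	if (len < 2 * sizeof(uint32_t))
		return -1;
	h->magic = get_le32(img + off);
	off += 4;
	slen = get_le32(img + off);
	off += 4;
	if (slen >= sizeof(h->ident) || len - off < slen + 4 * sizeof(uint32_t))
		return -1;
	memcpy(h->ident, img + off, slen);
	h->ident[slen] = '\0';
	off += slen;
	h->version  = get_le32(img + off);
	h->config   = get_le32(img + off + 4);
	h->sym_num  = get_le32(img + off + 8);
	h->ocon_num = get_le32(img + off + 12);
	return 0;
}

Everything after this header (the policycaps and permissive_map ebitmaps, the symbol tables, avtab, conditional lists, role transitions and allows, ocontexts, genfs, range transitions and type/attribute maps) is emitted in the version-gated order shown in policydb_write() above.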
diff --git a/security/selinux/ss/policydb.h b/security/selinux/ss/policydb.h
index 310e94442cb8..95d3d7de361e 100644
--- a/security/selinux/ss/policydb.h
+++ b/security/selinux/ss/policydb.h
@@ -254,6 +254,9 @@ struct policydb {
 
 	struct ebitmap permissive_map;
 
+	/* length of this policy when it was loaded */
+	size_t len;
+
 	unsigned int policyvers;
 
 	unsigned int reject_unknown : 1;
@@ -270,6 +273,7 @@ extern int policydb_class_isvalid(struct policydb *p, unsigned int class);
 extern int policydb_type_isvalid(struct policydb *p, unsigned int type);
 extern int policydb_role_isvalid(struct policydb *p, unsigned int role);
 extern int policydb_read(struct policydb *p, void *fp);
+extern int policydb_write(struct policydb *p, void *fp);
 
 #define PERM_SYMTAB_SIZE 32
 
@@ -290,6 +294,11 @@ struct policy_file {
 	size_t len;
 };
 
+struct policy_data {
+	struct policydb *p;
+	void *fp;
+};
+
 static inline int next_entry(void *buf, struct policy_file *fp, size_t bytes)
 {
 	if (bytes > fp->len)
@@ -301,6 +310,17 @@ static inline int next_entry(void *buf, struct policy_file *fp, size_t bytes)
 	return 0;
 }
 
+static inline int put_entry(void *buf, size_t bytes, int num, struct policy_file *fp)
+{
+	size_t len = bytes * num;
+
+	memcpy(fp->data, buf, len);
+	fp->data += len;
+	fp->len -= len;
+
+	return 0;
+}
+
 extern u16 string_to_security_class(struct policydb *p, const char *name);
 extern u32 string_to_av_perm(struct policydb *p, u16 tclass, const char *name);
 
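put_entry() added above is the write-side counterpart of next_entry(): it copies into the policy_file buffer and advances fp->data while shrinking fp->len, so a caller can recover the number of bytes actually written by subtracting the original start pointer from the final fp->data, which is exactly what security_read_policy() does in services.c below. Unlike next_entry(), it performs no bounds check against fp->len; in this patch the caller pre-sizes the buffer from security_policydb_len(). The following is a stand-alone sketch of the same cursor pattern with an explicit bounds check added for illustration; the names wcursor, wc_put and wc_written are not from the kernel.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

struct wcursor {
	uint8_t *data;	/* next byte to be written */
	size_t len;	/* bytes remaining in the buffer */
};

static int wc_put(struct wcursor *wc, const void *buf, size_t bytes, size_t num)
{
	size_t len = bytes * num;

	if (len > wc->len)	/* put_entry() itself omits this check */
		return -1;
	memcpy(wc->data, buf, len);
	wc->data += len;
	wc->len -= len;
	return 0;
}

static size_t wc_written(const struct wcursor *wc, const uint8_t *start)
{
	/* mirrors the pointer subtraction done by security_read_policy() */
	return (size_t)(wc->data - start);
}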
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index 9ea2feca3cd4..223c1ff6ef23 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -51,6 +51,7 @@
 #include <linux/mutex.h>
 #include <linux/selinux.h>
 #include <linux/flex_array.h>
+#include <linux/vmalloc.h>
 #include <net/netlabel.h>
 
 #include "flask.h"
@@ -991,7 +992,8 @@ static int context_struct_to_string(struct context *context, char **scontext, u3
 {
 	char *scontextp;
 
-	*scontext = NULL;
+	if (scontext)
+		*scontext = NULL;
 	*scontext_len = 0;
 
 	if (context->len) {
@@ -1008,6 +1010,9 @@ static int context_struct_to_string(struct context *context, char **scontext, u3
 	*scontext_len += strlen(policydb.p_type_val_to_name[context->type - 1]) + 1;
 	*scontext_len += mls_compute_context_len(context);
 
+	if (!scontext)
+		return 0;
+
 	/* Allocate space for the context; caller must free this space. */
 	scontextp = kmalloc(*scontext_len, GFP_ATOMIC);
 	if (!scontextp)
@@ -1047,7 +1052,8 @@ static int security_sid_to_context_core(u32 sid, char **scontext,
 	struct context *context;
 	int rc = 0;
 
-	*scontext = NULL;
+	if (scontext)
+		*scontext = NULL;
 	*scontext_len = 0;
 
 	if (!ss_initialized) {
@@ -1055,6 +1061,8 @@ static int security_sid_to_context_core(u32 sid, char **scontext,
 			char *scontextp;
 
 			*scontext_len = strlen(initial_sid_to_string[sid]) + 1;
+			if (!scontext)
+				goto out;
 			scontextp = kmalloc(*scontext_len, GFP_ATOMIC);
 			if (!scontextp) {
 				rc = -ENOMEM;
@@ -1769,6 +1777,7 @@ int security_load_policy(void *data, size_t len)
 			return rc;
 		}
 
+		policydb.len = len;
 		rc = selinux_set_mapping(&policydb, secclass_map,
 					 &current_mapping,
 					 &current_mapping_size);
@@ -1791,6 +1800,7 @@ int security_load_policy(void *data, size_t len)
 		selinux_complete_init();
 		avc_ss_reset(seqno);
 		selnl_notify_policyload(seqno);
+		selinux_status_update_policyload(seqno);
 		selinux_netlbl_cache_invalidate();
 		selinux_xfrm_notify_policyload();
 		return 0;
@@ -1804,6 +1814,7 @@ int security_load_policy(void *data, size_t len)
 	if (rc)
 		return rc;
 
+	newpolicydb.len = len;
 	/* If switching between different policy types, log MLS status */
 	if (policydb.mls_enabled && !newpolicydb.mls_enabled)
 		printk(KERN_INFO "SELinux: Disabling MLS support...\n");
@@ -1870,6 +1881,7 @@ int security_load_policy(void *data, size_t len)
 
 	avc_ss_reset(seqno);
 	selnl_notify_policyload(seqno);
+	selinux_status_update_policyload(seqno);
 	selinux_netlbl_cache_invalidate();
 	selinux_xfrm_notify_policyload();
 
@@ -1883,6 +1895,17 @@ err:
 
 }
 
+size_t security_policydb_len(void)
+{
+	size_t len;
+
+	read_lock(&policy_rwlock);
+	len = policydb.len;
+	read_unlock(&policy_rwlock);
+
+	return len;
+}
+
 /**
  * security_port_sid - Obtain the SID for a port.
  * @protocol: protocol number
@@ -2374,6 +2397,7 @@ out:
 	if (!rc) {
 		avc_ss_reset(seqno);
 		selnl_notify_policyload(seqno);
+		selinux_status_update_policyload(seqno);
 		selinux_xfrm_notify_policyload();
 	}
 	return rc;
@@ -3129,3 +3153,38 @@ netlbl_sid_to_secattr_failure:
 	return rc;
 }
 #endif /* CONFIG_NETLABEL */
+
+/**
+ * security_read_policy - read the policy.
+ * @data: binary policy data
+ * @len: length of data in bytes
+ *
+ */
+int security_read_policy(void **data, ssize_t *len)
+{
+	int rc;
+	struct policy_file fp;
+
+	if (!ss_initialized)
+		return -EINVAL;
+
+	*len = security_policydb_len();
+
+	*data = vmalloc_user(*len);
+	if (!*data)
+		return -ENOMEM;
+
+	fp.data = *data;
+	fp.len = *len;
+
+	read_lock(&policy_rwlock);
+	rc = policydb_write(&policydb, &fp);
+	read_unlock(&policy_rwlock);
+
+	if (rc)
+		return rc;
+
+	*len = (unsigned long)fp.data - (unsigned long)*data;
+	return 0;
+
+}
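security_read_policy() above allocates the serialized image with vmalloc_user(), so the buffer is suitable for mapping into userspace and must be released with vfree(). Below is a hedged sketch of an in-kernel caller; dump_policy_image() is a hypothetical helper used purely for illustration (the real consumer added by this patch is the selinuxfs policy node), and note that the version of security_read_policy() shown above does not free the buffer itself if policydb_write() fails.

#include <linux/vmalloc.h>
#include <linux/printk.h>

static int dump_policy_image(void)
{
	void *data;
	ssize_t len;
	int rc;

	rc = security_read_policy(&data, &len);
	if (rc)
		return rc;	/* -EINVAL before the first policy load, -ENOMEM,
				 * or an error from the policy writer */

	pr_info("SELinux: serialized policy image is %zd bytes\n", len);

	/* ... hand the image to userspace, e.g. via read()/mmap on selinuxfs ... */

	vfree(data);		/* vmalloc_user() allocation, released with vfree() */
	return 0;
}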
diff --git a/security/selinux/ss/status.c b/security/selinux/ss/status.c
new file mode 100644
index 000000000000..d982365f9d1a
--- /dev/null
+++ b/security/selinux/ss/status.c
@@ -0,0 +1,126 @@
1/*
2 * mmap based event notifications for SELinux
3 *
4 * Author: KaiGai Kohei <kaigai@ak.jp.nec.com>
5 *
6 * Copyright (C) 2010 NEC corporation
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2,
10 * as published by the Free Software Foundation.
11 */
12#include <linux/kernel.h>
13#include <linux/gfp.h>
14#include <linux/mm.h>
15#include <linux/mutex.h>
16#include "avc.h"
17#include "services.h"
18
19/*
20 * The selinux_status_page is exposed to userspace applications through
21 * the mmap interface on /selinux/status.
22 * It lets the kernel notify applications of events that invalidate their
23 * userspace access vector cache without a context switch.
24 *
25 * The selinux_kernel_status structure at the head of the status page is
26 * protected from concurrent access using seqlock logic, so userspace
27 * applications must read the status page according to that seqlock
28 * protocol.
29 *
30 * Typically, an application checks status->sequence at the start of its
31 * access control routine. If the value is odd, the kernel is updating
32 * the status, so the application should wait briefly and retry. If the
33 * value differs from the previously observed sequence number, something
34 * has changed and the application resets its userspace AVC if needed.
35 * In the common case, the application can confirm that the kernel status
36 * is unchanged without making any system calls.
37 */
38static struct page *selinux_status_page;
39static DEFINE_MUTEX(selinux_status_lock);
40
41/*
42 * selinux_kernel_status_page
43 *
44 * Returns a reference to selinux_status_page, allocating it on first
45 * use if it has not already been allocated.
46 */
47struct page *selinux_kernel_status_page(void)
48{
49 struct selinux_kernel_status *status;
50 struct page *result = NULL;
51
52 mutex_lock(&selinux_status_lock);
53 if (!selinux_status_page) {
54 selinux_status_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
55
56 if (selinux_status_page) {
57 status = page_address(selinux_status_page);
58
59 status->version = SELINUX_KERNEL_STATUS_VERSION;
60 status->sequence = 0;
61 status->enforcing = selinux_enforcing;
62 /*
63			 * NOTE: the next policy load event will store a
64			 * nonzero value in status->policyload; it may not
65			 * be 1, but it is never zero, so applications can
66			 * detect that the policy was updated.
67 */
68 status->policyload = 0;
69 status->deny_unknown = !security_get_allow_unknown();
70 }
71 }
72 result = selinux_status_page;
73 mutex_unlock(&selinux_status_lock);
74
75 return result;
76}
77
78/*
79 * selinux_status_update_setenforce
80 *
81 * Updates the status page with the current enforcing/permissive mode.
82 */
83void selinux_status_update_setenforce(int enforcing)
84{
85 struct selinux_kernel_status *status;
86
87 mutex_lock(&selinux_status_lock);
88 if (selinux_status_page) {
89 status = page_address(selinux_status_page);
90
91 status->sequence++;
92 smp_wmb();
93
94 status->enforcing = enforcing;
95
96 smp_wmb();
97 status->sequence++;
98 }
99 mutex_unlock(&selinux_status_lock);
100}
101
102/*
103 * selinux_status_update_policyload
104 *
105 * Updates the status page with the policy load count (seqno) and the
106 * current deny_unknown setting.
107 */
108void selinux_status_update_policyload(int seqno)
109{
110 struct selinux_kernel_status *status;
111
112 mutex_lock(&selinux_status_lock);
113 if (selinux_status_page) {
114 status = page_address(selinux_status_page);
115
116 status->sequence++;
117 smp_wmb();
118
119 status->policyload = seqno;
120 status->deny_unknown = !security_get_allow_unknown();
121
122 smp_wmb();
123 status->sequence++;
124 }
125 mutex_unlock(&selinux_status_lock);
126}
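The comment block at the top of status.c above describes a sequence-counter protocol for readers of the mmap'ed page. The sketch below shows what a conforming userspace reader might look like; the struct layout (five 32-bit fields in the order the kernel initializes them above) and the /selinux/status path are assumptions taken from this patch rather than a statement of a stable ABI, and the program is illustrative only.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

/* assumed layout, mirroring the fields the kernel initializes above */
struct selinux_status {
	uint32_t version;
	uint32_t sequence;	/* odd while the kernel is updating the page */
	uint32_t enforcing;
	uint32_t policyload;	/* becomes nonzero, and changes, on policy reload */
	uint32_t deny_unknown;
};

int main(void)
{
	long pagesize = sysconf(_SC_PAGESIZE);
	volatile struct selinux_status *st;
	uint32_t seq, enforcing, policyload;
	int fd;

	fd = open("/selinux/status", O_RDONLY);
	if (fd < 0)
		return 1;
	st = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
	if (st == MAP_FAILED)
		return 1;

	do {	/* seqlock-style read: retry if an update raced with us */
		seq = st->sequence;
		__sync_synchronize();	/* pairs with the kernel's smp_wmb() */
		enforcing = st->enforcing;
		policyload = st->policyload;
		__sync_synchronize();
	} while ((seq & 1) || seq != st->sequence);

	printf("enforcing=%u policyload=%u\n", enforcing, policyload);

	munmap((void *)st, pagesize);
	close(fd);
	return 0;
}

In the common case the loop body runs once, so the application confirms that nothing changed without issuing a single system call, which is the point of the mechanism.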
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index c448d57ae2b7..bc39f4067af6 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -1281,12 +1281,11 @@ static int smack_task_getioprio(struct task_struct *p)
  *
  * Return 0 if read access is permitted
  */
-static int smack_task_setscheduler(struct task_struct *p, int policy,
-				   struct sched_param *lp)
+static int smack_task_setscheduler(struct task_struct *p)
 {
 	int rc;
 
-	rc = cap_task_setscheduler(p, policy, lp);
+	rc = cap_task_setscheduler(p);
 	if (rc == 0)
 		rc = smk_curacc_on_task(p, MAY_WRITE);
 	return rc;
@@ -3005,7 +3004,8 @@ static int smack_secid_to_secctx(u32 secid, char **secdata, u32 *seclen)
 {
 	char *sp = smack_from_secid(secid);
 
-	*secdata = sp;
+	if (secdata)
+		*secdata = sp;
 	*seclen = strlen(sp);
 	return 0;
 }
diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c
index c668b447c725..7556315c1978 100644
--- a/security/tomoyo/common.c
+++ b/security/tomoyo/common.c
@@ -768,8 +768,10 @@ static bool tomoyo_select_one(struct tomoyo_io_buffer *head, const char *data)
 		return true; /* Do nothing if open(O_WRONLY). */
 	memset(&head->r, 0, sizeof(head->r));
 	head->r.print_this_domain_only = true;
-	head->r.eof = !domain;
-	head->r.domain = &domain->list;
+	if (domain)
+		head->r.domain = &domain->list;
+	else
+		head->r.eof = 1;
 	tomoyo_io_printf(head, "# select %s\n", data);
 	if (domain && domain->is_deleted)
 		tomoyo_io_printf(head, "# This is a deleted domain.\n");
@@ -2051,13 +2053,22 @@ void tomoyo_check_profile(void)
 		const u8 profile = domain->profile;
 		if (tomoyo_profile_ptr[profile])
 			continue;
+		printk(KERN_ERR "You need to define profile %u before using it.\n",
+		       profile);
+		printk(KERN_ERR "Please see http://tomoyo.sourceforge.jp/2.3/ "
+		       "for more information.\n");
 		panic("Profile %u (used by '%s') not defined.\n",
 		      profile, domain->domainname->name);
 	}
 	tomoyo_read_unlock(idx);
-	if (tomoyo_profile_version != 20090903)
+	if (tomoyo_profile_version != 20090903) {
+		printk(KERN_ERR "You need to install userland programs for "
+		       "TOMOYO 2.3 and initialize policy configuration.\n");
+		printk(KERN_ERR "Please see http://tomoyo.sourceforge.jp/2.3/ "
+		       "for more information.\n");
 		panic("Profile version %u is not supported.\n",
 		      tomoyo_profile_version);
+	}
 	printk(KERN_INFO "TOMOYO: 2.3.0\n");
 	printk(KERN_INFO "Mandatory Access Control activated.\n");
 }