-rw-r--r--  Documentation/ABI/testing/sysfs-bus-usb | 2
-rw-r--r--  Documentation/DMA-API-HOWTO.txt (renamed from Documentation/PCI/PCI-DMA-mapping.txt) | 0
-rw-r--r--  Documentation/cgroups/memory.txt | 2
-rw-r--r--  Documentation/circular-buffers.txt | 234
-rw-r--r--  Documentation/filesystems/ceph.txt | 139
-rw-r--r--  Documentation/filesystems/tmpfs.txt | 6
-rw-r--r--  Documentation/ioctl/ioctl-number.txt | 1
-rw-r--r--  Documentation/kobject.txt | 60
-rw-r--r--  Documentation/memory-barriers.txt | 20
-rw-r--r--  Documentation/volatile-considered-harmful.txt | 6
-rw-r--r--  MAINTAINERS | 49
-rw-r--r--  Makefile | 2
-rw-r--r--  arch/alpha/include/asm/core_marvel.h | 1
-rw-r--r--  arch/alpha/include/asm/core_mcpcia.h | 1
-rw-r--r--  arch/alpha/include/asm/core_titan.h | 1
-rw-r--r--  arch/alpha/include/asm/core_tsunami.h | 1
-rw-r--r--  arch/alpha/kernel/sys_dp264.c | 2
-rw-r--r--  arch/alpha/kernel/sys_titan.c | 2
-rw-r--r--  arch/alpha/kernel/traps.c | 10
-rw-r--r--  arch/arm/common/locomo.c | 10
-rw-r--r--  arch/arm/mach-ixp23xx/include/mach/memory.h | 2
-rw-r--r--  arch/arm/mach-kirkwood/mv88f6281gtw_ge-setup.c | 4
-rw-r--r--  arch/arm/mach-mmp/include/mach/uncompress.h | 5
-rw-r--r--  arch/arm/mach-orion5x/wrt350n-v2-setup.c | 2
-rw-r--r--  arch/arm/mach-pxa/Kconfig | 11
-rw-r--r--  arch/arm/mach-pxa/imote2.c | 4
-rw-r--r--  arch/arm/mach-pxa/include/mach/uncompress.h | 11
-rw-r--r--  arch/arm/mach-pxa/raumfeld.c | 4
-rw-r--r--  arch/arm/mach-pxa/stargate2.c | 5
-rw-r--r--  arch/arm/tools/mach-types | 75
-rw-r--r--  arch/powerpc/Kconfig | 13
-rw-r--r--  arch/powerpc/include/asm/ppc-opcode.h | 6
-rw-r--r--  arch/powerpc/include/asm/syscall.h | 6
-rw-r--r--  arch/powerpc/kernel/head_fsl_booke.S | 7
-rw-r--r--  arch/powerpc/kernel/iommu.c | 7
-rw-r--r--  arch/powerpc/kernel/setup_32.c | 6
-rw-r--r--  arch/powerpc/kernel/setup_64.c | 6
-rw-r--r--  arch/powerpc/mm/mem.c | 6
-rw-r--r--  arch/s390/boot/compressed/misc.c | 8
-rw-r--r--  arch/s390/include/asm/system.h | 9
-rw-r--r--  arch/s390/kernel/head.S | 3
-rw-r--r--  arch/s390/kernel/head64.S | 2
-rw-r--r--  arch/s390/kernel/setup.c | 4
-rw-r--r--  arch/s390/kernel/smp.c | 6
-rw-r--r--  arch/s390/mm/maccess.c | 26
-rw-r--r--  arch/sh/boards/mach-ecovec24/setup.c | 2
-rw-r--r--  arch/sh/boards/mach-se/7724/setup.c | 8
-rw-r--r--  arch/sh/configs/ecovec24_defconfig | 236
-rw-r--r--  arch/sh/include/asm/clkdev.h | 35
-rw-r--r--  arch/sh/include/asm/clock.h | 7
-rw-r--r--  arch/sh/include/asm/dmaengine.h | 63
-rw-r--r--  arch/sh/include/asm/elf.h | 6
-rw-r--r--  arch/sh/include/asm/mmu.h | 7
-rw-r--r--  arch/sh/include/asm/siu.h | 8
-rw-r--r--  arch/sh/include/cpu-sh4/cpu/mmu_context.h | 11
-rw-r--r--  arch/sh/include/cpu-sh4/cpu/watchdog.h | 6
-rw-r--r--  arch/sh/kernel/Makefile | 2
-rw-r--r--  arch/sh/kernel/clkdev.c | 169
-rw-r--r--  arch/sh/kernel/cpu/clock-cpg.c | 5
-rw-r--r--  arch/sh/kernel/cpu/clock.c | 55
-rw-r--r--  arch/sh/kernel/cpu/sh2/setup-sh7619.c | 6
-rw-r--r--  arch/sh/kernel/cpu/sh2a/setup-mxg.c | 9
-rw-r--r--  arch/sh/kernel/cpu/sh2a/setup-sh7201.c | 9
-rw-r--r--  arch/sh/kernel/cpu/sh2a/setup-sh7203.c | 12
-rw-r--r--  arch/sh/kernel/cpu/sh2a/setup-sh7206.c | 15
-rw-r--r--  arch/sh/kernel/cpu/sh3/setup-sh7705.c | 9
-rw-r--r--  arch/sh/kernel/cpu/sh3/setup-sh770x.c | 9
-rw-r--r--  arch/sh/kernel/cpu/sh3/setup-sh7710.c | 9
-rw-r--r--  arch/sh/kernel/cpu/sh3/setup-sh7720.c | 24
-rw-r--r--  arch/sh/kernel/cpu/sh4/setup-sh4-202.c | 9
-rw-r--r--  arch/sh/kernel/cpu/sh4/setup-sh7750.c | 15
-rw-r--r--  arch/sh/kernel/cpu/sh4/setup-sh7760.c | 9
-rw-r--r--  arch/sh/kernel/cpu/sh4a/clock-sh7343.c | 12
-rw-r--r--  arch/sh/kernel/cpu/sh4a/clock-sh7366.c | 10
-rw-r--r--  arch/sh/kernel/cpu/sh4a/clock-sh7722.c | 10
-rw-r--r--  arch/sh/kernel/cpu/sh4a/clock-sh7723.c | 58
-rw-r--r--  arch/sh/kernel/cpu/sh4a/clock-sh7724.c | 56
-rw-r--r--  arch/sh/kernel/cpu/sh4a/clock-sh7785.c | 51
-rw-r--r--  arch/sh/kernel/cpu/sh4a/clock-sh7786.c | 80
-rw-r--r--  arch/sh/kernel/cpu/sh4a/setup-sh7343.c | 16
-rw-r--r--  arch/sh/kernel/cpu/sh4a/setup-sh7366.c | 13
-rw-r--r--  arch/sh/kernel/cpu/sh4a/setup-sh7722.c | 15
-rw-r--r--  arch/sh/kernel/cpu/sh4a/setup-sh7723.c | 27
-rw-r--r--  arch/sh/kernel/cpu/sh4a/setup-sh7724.c | 27
-rw-r--r--  arch/sh/kernel/cpu/sh4a/setup-sh7757.c | 6
-rw-r--r--  arch/sh/kernel/cpu/sh4a/setup-sh7763.c | 18
-rw-r--r--  arch/sh/kernel/cpu/sh4a/setup-sh7770.c | 27
-rw-r--r--  arch/sh/kernel/cpu/sh4a/setup-sh7780.c | 18
-rw-r--r--  arch/sh/kernel/cpu/sh4a/setup-sh7785.c | 24
-rw-r--r--  arch/sh/kernel/cpu/sh4a/setup-sh7786.c | 36
-rw-r--r--  arch/sh/kernel/cpu/sh4a/setup-shx3.c | 18
-rw-r--r--  arch/sh/kernel/cpu/sh5/setup-sh5.c | 9
-rw-r--r--  arch/sh/kernel/cpufreq.c | 4
-rw-r--r--  arch/sh/kernel/dwarf.c | 4
-rw-r--r--  arch/sh/kernel/idle.c | 2
-rw-r--r--  arch/sh/kernel/perf_event.c | 2
-rw-r--r--  arch/sh/kernel/process_64.c | 7
-rw-r--r--  arch/sh/kernel/smp.c | 1
-rw-r--r--  arch/sh/mm/Makefile | 8
-rw-r--r--  arch/sh/mm/pmb.c | 4
-rw-r--r--  arch/sh/mm/tlb-debugfs.c | 179
-rw-r--r--  arch/sh/mm/tlb-pteaex.c | 2
-rw-r--r--  arch/sh/mm/tlb-urb.c | 22
-rw-r--r--  arch/sh/mm/tlbflush_32.c | 21
-rw-r--r--  arch/sh/mm/uncached.c | 9
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c | 8
-rw-r--r--  drivers/acpi/scan.c | 38
-rw-r--r--  drivers/ata/libata-sff.c | 43
-rw-r--r--  drivers/ata/pata_via.c | 1
-rw-r--r--  drivers/base/class.c | 2
-rw-r--r--  drivers/base/core.c | 6
-rw-r--r--  drivers/base/cpu.c | 16
-rw-r--r--  drivers/base/firmware_class.c | 2
-rw-r--r--  drivers/base/node.c | 7
-rw-r--r--  drivers/base/platform.c | 53
-rw-r--r--  drivers/char/agp/intel-agp.c | 34
-rw-r--r--  drivers/char/hvc_console.c | 31
-rw-r--r--  drivers/char/ipmi/ipmi_msghandler.c | 10
-rw-r--r--  drivers/char/tty_buffer.c | 4
-rw-r--r--  drivers/char/tty_port.c | 2
-rw-r--r--  drivers/char/virtio_console.c | 15
-rw-r--r--  drivers/char/vt_ioctl.c | 39
-rw-r--r--  drivers/clocksource/sh_cmt.c | 42
-rw-r--r--  drivers/clocksource/sh_mtu2.c | 34
-rw-r--r--  drivers/clocksource/sh_tmu.c | 38
-rw-r--r--  drivers/dma/shdma.c | 13
-rw-r--r--  drivers/dma/shdma.h | 4
-rw-r--r--  drivers/edac/edac_mce_amd.c | 7
-rw-r--r--  drivers/gpio/max730x.c | 4
-rw-r--r--  drivers/gpu/drm/i915/i915_dma.c | 46
-rw-r--r--  drivers/gpu/drm/i915/i915_drv.c | 4
-rw-r--r--  drivers/gpu/drm/i915/i915_drv.h | 4
-rw-r--r--  drivers/gpu/drm/i915/i915_gem.c | 31
-rw-r--r--  drivers/gpu/drm/i915/i915_gem_tiling.c | 7
-rw-r--r--  drivers/gpu/drm/i915/i915_reg.h | 14
-rw-r--r--  drivers/gpu/drm/i915/intel_bios.c | 5
-rw-r--r--  drivers/gpu/drm/i915/intel_display.c | 16
-rw-r--r--  drivers/gpu/drm/i915/intel_lvds.c | 52
-rw-r--r--  drivers/gpu/drm/i915/intel_overlay.c | 13
-rw-r--r--  drivers/i2c/busses/i2c-scmi.c | 32
-rw-r--r--  drivers/infiniband/core/sysfs.c | 1
-rw-r--r--  drivers/misc/c2port/core.c | 4
-rw-r--r--  drivers/mmc/core/mmc.c | 3
-rw-r--r--  drivers/mtd/maps/omap_nor.c | 0
-rw-r--r--  drivers/net/arm/ks8695net.c | 24
-rw-r--r--  drivers/net/igb/e1000_82575.c | 1
-rw-r--r--  drivers/net/igb/e1000_hw.h | 1
-rw-r--r--  drivers/net/igb/igb_main.c | 1
-rw-r--r--  drivers/net/ixgbe/ixgbe_82599.c | 78
-rw-r--r--  drivers/net/ixgbe/ixgbe_fcoe.c | 6
-rw-r--r--  drivers/net/ixgbe/ixgbe_main.c | 22
-rw-r--r--  drivers/net/ixgbe/ixgbe_type.h | 1
-rw-r--r--  drivers/net/ixgbevf/ethtool.c | 42
-rw-r--r--  drivers/net/ixgbevf/ixgbevf_main.c | 74
-rw-r--r--  drivers/net/ixgbevf/vf.h | 6
-rw-r--r--  drivers/net/jme.c | 35
-rw-r--r--  drivers/net/jme.h | 2
-rw-r--r--  drivers/net/ks8851.c | 1
-rw-r--r--  drivers/net/mlx4/main.c | 1
-rw-r--r--  drivers/net/usb/smsc95xx.c | 18
-rw-r--r--  drivers/net/wireless/ath/ath9k/xmit.c | 21
-rw-r--r--  drivers/net/wireless/iwlwifi/iwl-tx.c | 2
-rw-r--r--  drivers/net/wireless/wl12xx/wl1251_debugfs.c | 3
-rw-r--r--  drivers/pci/pci-sysfs.c | 2
-rw-r--r--  drivers/regulator/core.c | 1
-rw-r--r--  drivers/regulator/lp3971.c | 10
-rw-r--r--  drivers/regulator/max1586.c | 2
-rw-r--r--  drivers/regulator/max8649.c | 3
-rw-r--r--  drivers/regulator/max8660.c | 2
-rw-r--r--  drivers/regulator/max8925-regulator.c | 6
-rw-r--r--  drivers/rtc/rtc-mc13783.c | 23
-rw-r--r--  drivers/s390/block/dasd_3990_erp.c | 7
-rw-r--r--  drivers/s390/block/dasd_eckd.c | 4
-rw-r--r--  drivers/s390/char/sclp_async.c | 1
-rw-r--r--  drivers/s390/char/sclp_cmd.c | 14
-rw-r--r--  drivers/s390/char/zcore.c | 31
-rw-r--r--  drivers/serial/cpm_uart/cpm_uart_cpm2.c | 4
-rw-r--r--  drivers/serial/sh-sci.c | 194
-rw-r--r--  drivers/serial/sh-sci.h | 35
-rw-r--r--  drivers/sh/intc.c | 31
-rw-r--r--  drivers/usb/class/cdc-acm.c | 2
-rw-r--r--  drivers/usb/class/cdc-wdm.c | 134
-rw-r--r--  drivers/usb/core/devio.c | 17
-rw-r--r--  drivers/usb/core/urb.c | 1
-rw-r--r--  drivers/usb/gadget/Kconfig | 2
-rw-r--r--  drivers/usb/gadget/epautoconf.c | 2
-rw-r--r--  drivers/usb/gadget/f_mass_storage.c | 3
-rw-r--r--  drivers/usb/gadget/gadget_chips.h | 8
-rw-r--r--  drivers/usb/gadget/goku_udc.c | 2
-rw-r--r--  drivers/usb/gadget/multi.c | 2
-rw-r--r--  drivers/usb/host/Makefile | 4
-rw-r--r--  drivers/usb/host/ehci-hcd.c | 2
-rw-r--r--  drivers/usb/host/ehci-sched.c | 28
-rw-r--r--  drivers/usb/host/ehci.h | 5
-rw-r--r--  drivers/usb/host/r8a66597-hcd.c | 16
-rw-r--r--  drivers/usb/host/xhci-mem.c | 9
-rw-r--r--  drivers/usb/host/xhci.c (renamed from drivers/usb/host/xhci-hcd.c) | 1
-rw-r--r--  drivers/usb/musb/musb_core.c | 13
-rw-r--r--  drivers/usb/musb/musb_core.h | 4
-rw-r--r--  drivers/usb/musb/musb_host.c | 2
-rw-r--r--  drivers/usb/musb/musb_regs.h | 28
-rw-r--r--  drivers/usb/serial/Kconfig | 4
-rw-r--r--  drivers/usb/serial/console.c | 1
-rw-r--r--  drivers/usb/serial/cp210x.c | 5
-rw-r--r--  drivers/usb/serial/ftdi_sio.c | 7
-rw-r--r--  drivers/usb/serial/ftdi_sio_ids.h | 7
-rw-r--r--  drivers/usb/serial/generic.c | 49
-rw-r--r--  drivers/usb/serial/option.c | 53
-rw-r--r--  drivers/usb/serial/qcserial.c | 29
-rw-r--r--  drivers/usb/storage/unusual_devs.h | 23
-rw-r--r--  drivers/uwb/hwa-rc.c | 2
-rw-r--r--  drivers/uwb/i1480/dfu/usb.c | 12
-rw-r--r--  drivers/uwb/wlp/messages.c | 106
-rw-r--r--  drivers/vhost/net.c | 10
-rw-r--r--  drivers/vhost/vhost.c | 18
-rw-r--r--  drivers/video/geode/lxfb.h | 2
-rw-r--r--  drivers/video/geode/lxfb_ops.c | 10
-rw-r--r--  drivers/video/omap2/displays/panel-generic.c | 22
-rw-r--r--  drivers/video/omap2/dss/dss.c | 3
-rw-r--r--  drivers/video/omap2/vram.c | 11
-rw-r--r--  drivers/video/pxa168fb.c | 2
-rw-r--r--  fs/Kconfig | 1
-rw-r--r--  fs/Makefile | 1
-rw-r--r--  fs/afs/security.c | 5
-rw-r--r--  fs/binfmt_aout.c | 14
-rw-r--r--  fs/binfmt_elf_fdpic.c | 2
-rw-r--r--  fs/ceph/Kconfig | 27
-rw-r--r--  fs/ceph/Makefile | 39
-rw-r--r--  fs/ceph/README | 20
-rw-r--r--  fs/ceph/addr.c | 1188
-rw-r--r--  fs/ceph/armor.c | 99
-rw-r--r--  fs/ceph/auth.c | 257
-rw-r--r--  fs/ceph/auth.h | 84
-rw-r--r--  fs/ceph/auth_none.c | 121
-rw-r--r--  fs/ceph/auth_none.h | 28
-rw-r--r--  fs/ceph/auth_x.c | 656
-rw-r--r--  fs/ceph/auth_x.h | 49
-rw-r--r--  fs/ceph/auth_x_protocol.h | 90
-rw-r--r--  fs/ceph/buffer.c | 78
-rw-r--r--  fs/ceph/buffer.h | 39
-rw-r--r--  fs/ceph/caps.c | 2927
-rw-r--r--  fs/ceph/ceph_debug.h | 37
-rw-r--r--  fs/ceph/ceph_frag.c | 21
-rw-r--r--  fs/ceph/ceph_frag.h | 109
-rw-r--r--  fs/ceph/ceph_fs.c | 74
-rw-r--r--  fs/ceph/ceph_fs.h | 650
-rw-r--r--  fs/ceph/ceph_hash.c | 118
-rw-r--r--  fs/ceph/ceph_hash.h | 13
-rw-r--r--  fs/ceph/ceph_strings.c | 176
-rw-r--r--  fs/ceph/crush/crush.c | 151
-rw-r--r--  fs/ceph/crush/crush.h | 180
-rw-r--r--  fs/ceph/crush/hash.c | 149
-rw-r--r--  fs/ceph/crush/hash.h | 17
-rw-r--r--  fs/ceph/crush/mapper.c | 596
-rw-r--r--  fs/ceph/crush/mapper.h | 20
-rw-r--r--  fs/ceph/crypto.c | 408
-rw-r--r--  fs/ceph/crypto.h | 48
-rw-r--r--  fs/ceph/debugfs.c | 483
-rw-r--r--  fs/ceph/decode.h | 194
-rw-r--r--  fs/ceph/dir.c | 1220
-rw-r--r--  fs/ceph/export.c | 223
-rw-r--r--  fs/ceph/file.c | 937
-rw-r--r--  fs/ceph/inode.c | 1750
-rw-r--r--  fs/ceph/ioctl.c | 160
-rw-r--r--  fs/ceph/ioctl.h | 40
-rw-r--r--  fs/ceph/mds_client.c | 3021
-rw-r--r--  fs/ceph/mds_client.h | 335
-rw-r--r--  fs/ceph/mdsmap.c | 174
-rw-r--r--  fs/ceph/mdsmap.h | 54
-rw-r--r--  fs/ceph/messenger.c | 2240
-rw-r--r--  fs/ceph/messenger.h | 254
-rw-r--r--  fs/ceph/mon_client.c | 834
-rw-r--r--  fs/ceph/mon_client.h | 119
-rw-r--r--  fs/ceph/msgpool.c | 186
-rw-r--r--  fs/ceph/msgpool.h | 27
-rw-r--r--  fs/ceph/msgr.h | 158
-rw-r--r--  fs/ceph/osd_client.c | 1537
-rw-r--r--  fs/ceph/osd_client.h | 166
-rw-r--r--  fs/ceph/osdmap.c | 1019
-rw-r--r--  fs/ceph/osdmap.h | 125
-rw-r--r--  fs/ceph/pagelist.c | 54
-rw-r--r--  fs/ceph/pagelist.h | 54
-rw-r--r--  fs/ceph/rados.h | 374
-rw-r--r--  fs/ceph/snap.c | 904
-rw-r--r--  fs/ceph/super.c | 1030
-rw-r--r--  fs/ceph/super.h | 901
-rw-r--r--  fs/ceph/types.h | 29
-rw-r--r--  fs/ceph/xattr.c | 844
-rw-r--r--  fs/cifs/cifsfs.c | 3
-rw-r--r--  fs/cifs/cifsfs.h | 3
-rw-r--r--  fs/cifs/cifsglob.h | 1
-rw-r--r--  fs/cifs/cifsproto.h | 6
-rw-r--r--  fs/cifs/cifssmb.c | 135
-rw-r--r--  fs/cifs/dir.c | 2
-rw-r--r--  fs/cifs/file.c | 7
-rw-r--r--  fs/cifs/inode.c | 297
-rw-r--r--  fs/fscache/page.c | 1
-rw-r--r--  fs/nfs/file.c | 3
-rw-r--r--  fs/nfs/nfs4xdr.c | 2
-rw-r--r--  fs/partitions/msdos.c | 85
-rw-r--r--  fs/proc/kcore.c | 2
-rw-r--r--  fs/read_write.c | 2
-rw-r--r--  fs/reiserfs/journal.c | 15
-rw-r--r--  fs/reiserfs/xattr_security.c | 2
-rw-r--r--  include/acpi/acpi_drivers.h | 2
-rw-r--r--  include/linux/circ_buf.h | 4
-rw-r--r--  include/linux/device.h | 4
-rw-r--r--  include/linux/if_tunnel.h | 1
-rw-r--r--  include/linux/kfifo.h | 2
-rw-r--r--  include/linux/mmc/mmc.h | 1
-rw-r--r--  include/linux/netdevice.h | 8
-rw-r--r--  include/linux/netfilter/nfnetlink.h | 2
-rw-r--r--  include/linux/netlink.h | 2
-rw-r--r--  include/linux/reiserfs_xattr.h | 5
-rw-r--r--  include/linux/serial_sci.h | 8
-rw-r--r--  include/linux/sh_dma.h | 101
-rw-r--r--  include/linux/sunrpc/bc_xprt.h | 5
-rw-r--r--  include/linux/syscalls.h | 2
-rw-r--r--  include/linux/tty.h | 10
-rw-r--r--  include/linux/usb.h | 18
-rw-r--r--  include/linux/vt.h | 3
-rw-r--r--  include/net/bluetooth/bluetooth.h | 2
-rw-r--r--  include/net/netlink.h | 6
-rw-r--r--  init/main.c | 2
-rw-r--r--  ipc/syscall.c | 2
-rw-r--r--  kernel/cgroup.c | 1
-rw-r--r--  kernel/cpuset.c | 106
-rw-r--r--  kernel/kthread.c | 2
-rw-r--r--  lib/Kconfig.debug | 3
-rw-r--r--  mm/ksm.c | 2
-rw-r--r--  mm/memcontrol.c | 50
-rw-r--r--  mm/memory.c | 1
-rw-r--r--  mm/mempolicy.c | 50
-rw-r--r--  mm/mmu_context.c | 1
-rw-r--r--  mm/nommu.c | 7
-rw-r--r--  net/8021q/vlan_core.c | 4
-rw-r--r--  net/bluetooth/hci_sysfs.c | 3
-rw-r--r--  net/bluetooth/l2cap.c | 48
-rw-r--r--  net/bluetooth/rfcomm/core.c | 41
-rw-r--r--  net/bluetooth/rfcomm/sock.c | 38
-rw-r--r--  net/bluetooth/sco.c | 38
-rw-r--r--  net/core/dev.c | 8
-rw-r--r--  net/ipv4/fib_trie.c | 4
-rw-r--r--  net/ipv4/ip_gre.c | 4
-rw-r--r--  net/ipv4/ipmr.c | 3
-rw-r--r--  net/ipv4/route.c | 17
-rw-r--r--  net/ipv4/tcp.c | 65
-rw-r--r--  net/ipv4/tcp_input.c | 3
-rw-r--r--  net/ipv4/tcp_ipv4.c | 5
-rw-r--r--  net/ipv6/ip6mr.c | 3
-rw-r--r--  net/ipv6/route.c | 2
-rw-r--r--  net/netfilter/nf_conntrack_netlink.c | 4
-rw-r--r--  net/netfilter/nfnetlink.c | 4
-rw-r--r--  net/netlink/af_netlink.c | 17
-rw-r--r--  net/rxrpc/ar-accept.c | 6
-rw-r--r--  net/sunrpc/auth_gss/auth_gss.c | 5
-rw-r--r--  net/sunrpc/bc_svc.c | 15
-rw-r--r--  net/sunrpc/clnt.c | 1
-rw-r--r--  net/sunrpc/rpc_pipe.c | 2
-rw-r--r--  net/sunrpc/xprt.c | 22
-rw-r--r--  net/sunrpc/xprtsock.c | 3
-rwxr-xr-x  scripts/get_maintainer.pl | 2
-rwxr-xr-x  scripts/kernel-doc | 3
-rw-r--r--  sound/arm/pxa2xx-pcm-lib.c | 1
-rw-r--r--  sound/oss/vidc.c | 4
-rw-r--r--  sound/pci/cmipci.c | 14
-rw-r--r--  sound/pci/hda/patch_conexant.c | 15
-rw-r--r--  sound/pci/hda/patch_nvhdmi.c | 15
-rw-r--r--  sound/pci/hda/patch_realtek.c | 5
-rw-r--r--  sound/pci/hda/patch_sigmatel.c | 2
-rw-r--r--  sound/soc/codecs/tlv320dac33.c | 10
-rw-r--r--  sound/soc/codecs/wm_hubs.c | 2
-rw-r--r--  sound/soc/imx/Kconfig | 2
-rw-r--r--  sound/soc/sh/Kconfig | 1
374 files changed, 31813 insertions, 1938 deletions
diff --git a/Documentation/ABI/testing/sysfs-bus-usb b/Documentation/ABI/testing/sysfs-bus-usb
index a986e9bbba3d..bcebb9eaedce 100644
--- a/Documentation/ABI/testing/sysfs-bus-usb
+++ b/Documentation/ABI/testing/sysfs-bus-usb
@@ -160,7 +160,7 @@ Description:
 		match the driver to the device.  For example:
 		# echo "046d c315" > /sys/bus/usb/drivers/foo/remove_id

-What:		/sys/bus/usb/device/.../avoid_reset
+What:		/sys/bus/usb/device/.../avoid_reset_quirk
 Date:		December 2009
 Contact:	Oliver Neukum <oliver@neukum.org>
 Description:
diff --git a/Documentation/PCI/PCI-DMA-mapping.txt b/Documentation/DMA-API-HOWTO.txt
index 52618ab069ad..52618ab069ad 100644
--- a/Documentation/PCI/PCI-DMA-mapping.txt
+++ b/Documentation/DMA-API-HOWTO.txt
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index f8bc802d70b9..3a6aecd078ba 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -340,7 +340,7 @@ Note:
 5.3 swappiness
 	Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only.

-Following cgroups' swapiness can't be changed.
+Following cgroups' swappiness can't be changed.
 - root cgroup (uses /proc/sys/vm/swappiness).
 - a cgroup which uses hierarchy and it has child cgroup.
 - a cgroup which uses hierarchy and not the root of hierarchy.
diff --git a/Documentation/circular-buffers.txt b/Documentation/circular-buffers.txt
new file mode 100644
index 000000000000..8117e5bf6065
--- /dev/null
+++ b/Documentation/circular-buffers.txt
@@ -0,0 +1,234 @@
+================
+CIRCULAR BUFFERS
+================
+
+By: David Howells <dhowells@redhat.com>
+    Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+
+
+Linux provides a number of features that can be used to implement circular
+buffering.  There are two sets of such features:
+
+ (1) Convenience functions for determining information about power-of-2 sized
+     buffers.
+
+ (2) Memory barriers for when the producer and the consumer of objects in the
+     buffer don't want to share a lock.
+
+To use these facilities, as discussed below, there needs to be just one
+producer and just one consumer.  It is possible to handle multiple producers by
+serialising them, and to handle multiple consumers by serialising them.
+
+
+Contents:
+
+ (*) What is a circular buffer?
+
+ (*) Measuring power-of-2 buffers.
+
+ (*) Using memory barriers with circular buffers.
+     - The producer.
+     - The consumer.
+
+
+==========================
+WHAT IS A CIRCULAR BUFFER?
+==========================
+
+First of all, what is a circular buffer?  A circular buffer is a buffer of
+fixed, finite size into which there are two indices:
+
+ (1) A 'head' index - the point at which the producer inserts items into the
+     buffer.
+
+ (2) A 'tail' index - the point at which the consumer finds the next item in
+     the buffer.
+
+Typically when the tail pointer is equal to the head pointer, the buffer is
+empty; and the buffer is full when the head pointer is one less than the tail
+pointer.
+
+The head index is incremented when items are added, and the tail index when
+items are removed.  The tail index should never jump the head index, and both
+indices should be wrapped to 0 when they reach the end of the buffer, thus
+allowing an infinite amount of data to flow through the buffer.
+
+Typically, items will all be of the same unit size, but this isn't strictly
+required to use the techniques below.  The indices can be increased by more
+than 1 if multiple items or variable-sized items are to be included in the
+buffer, provided that neither index overtakes the other.  The implementer must
+be careful, however, as a region more than one unit in size may wrap the end of
+the buffer and be broken into two segments.
+
+
+============================
+MEASURING POWER-OF-2 BUFFERS
+============================
+
+Calculation of the occupancy or the remaining capacity of an arbitrarily sized
+circular buffer would normally be a slow operation, requiring the use of a
+modulus (divide) instruction.  However, if the buffer is of a power-of-2 size,
+then a much quicker bitwise-AND instruction can be used instead.
+
+Linux provides a set of macros for handling power-of-2 circular buffers.  These
+can be made use of by:
+
+	#include <linux/circ_buf.h>
+
+The macros are:
+
+ (*) Measure the remaining capacity of a buffer:
+
+	CIRC_SPACE(head_index, tail_index, buffer_size);
+
+     This returns the amount of space left in the buffer[1] into which items
+     can be inserted.
+
+
+ (*) Measure the maximum consecutive immediate space in a buffer:
+
+	CIRC_SPACE_TO_END(head_index, tail_index, buffer_size);
+
+     This returns the amount of consecutive space left in the buffer[1] into
+     which items can be immediately inserted without having to wrap back to the
+     beginning of the buffer.
+
+
+ (*) Measure the occupancy of a buffer:
+
+	CIRC_CNT(head_index, tail_index, buffer_size);
+
+     This returns the number of items currently occupying a buffer[2].
+
+
+ (*) Measure the non-wrapping occupancy of a buffer:
+
+	CIRC_CNT_TO_END(head_index, tail_index, buffer_size);
+
+     This returns the number of consecutive items[2] that can be extracted from
+     the buffer without having to wrap back to the beginning of the buffer.
+
+
+Each of these macros will nominally return a value between 0 and buffer_size-1,
+however:
+
+ [1] CIRC_SPACE*() are intended to be used in the producer.  To the producer
+     they will return a lower bound as the producer controls the head index,
+     but the consumer may still be depleting the buffer on another CPU and
+     moving the tail index.
+
+     To the consumer it will show an upper bound as the producer may be busy
+     depleting the space.
+
+ [2] CIRC_CNT*() are intended to be used in the consumer.  To the consumer they
+     will return a lower bound as the consumer controls the tail index, but the
+     producer may still be filling the buffer on another CPU and moving the
+     head index.
+
+     To the producer it will show an upper bound as the consumer may be busy
+     emptying the buffer.
+
+ [3] To a third party, the order in which the writes to the indices by the
+     producer and consumer become visible cannot be guaranteed as they are
+     independent and may be made on different CPUs - so the result in such a
+     situation will merely be a guess, and may even be negative.
+
+
+===========================================
+USING MEMORY BARRIERS WITH CIRCULAR BUFFERS
+===========================================
+
+By using memory barriers in conjunction with circular buffers, you can avoid
+the need to:
+
+ (1) use a single lock to govern access to both ends of the buffer, thus
+     allowing the buffer to be filled and emptied at the same time; and
+
+ (2) use atomic counter operations.
+
+There are two sides to this: the producer that fills the buffer, and the
+consumer that empties it.  Only one thing should be filling a buffer at any one
+time, and only one thing should be emptying a buffer at any one time, but the
+two sides can operate simultaneously.
+
+
+THE PRODUCER
+------------
+
+The producer will look something like this:
+
+	spin_lock(&producer_lock);
+
+	unsigned long head = buffer->head;
+	unsigned long tail = ACCESS_ONCE(buffer->tail);
+
+	if (CIRC_SPACE(head, tail, buffer->size) >= 1) {
+		/* insert one item into the buffer */
+		struct item *item = buffer[head];
+
+		produce_item(item);
+
+		smp_wmb(); /* commit the item before incrementing the head */
+
+		buffer->head = (head + 1) & (buffer->size - 1);
+
+		/* wake_up() will make sure that the head is committed before
+		 * waking anyone up */
+		wake_up(consumer);
+	}
+
+	spin_unlock(&producer_lock);
+
+This will instruct the CPU that the contents of the new item must be written
+before the head index makes it available to the consumer and then instructs the
+CPU that the revised head index must be written before the consumer is woken.
+
+Note that wake_up() doesn't have to be the exact mechanism used, but whatever
+is used must guarantee a (write) memory barrier between the update of the head
+index and the change of state of the consumer, if a change of state occurs.
+
+
+THE CONSUMER
+------------
+
+The consumer will look something like this:
+
+	spin_lock(&consumer_lock);
+
+	unsigned long head = ACCESS_ONCE(buffer->head);
+	unsigned long tail = buffer->tail;
+
+	if (CIRC_CNT(head, tail, buffer->size) >= 1) {
+		/* read index before reading contents at that index */
+		smp_read_barrier_depends();
+
+		/* extract one item from the buffer */
+		struct item *item = buffer[tail];
+
+		consume_item(item);
+
+		smp_mb(); /* finish reading descriptor before incrementing tail */
+
+		buffer->tail = (tail + 1) & (buffer->size - 1);
+	}
+
+	spin_unlock(&consumer_lock);
+
+This will instruct the CPU to make sure the index is up to date before reading
+the new item, and then it shall make sure the CPU has finished reading the item
+before it writes the new tail pointer, which will erase the item.
+
+
+Note the use of ACCESS_ONCE() in both algorithms to read the opposition index.
+This prevents the compiler from discarding and reloading its cached value -
+which some compilers will do across smp_read_barrier_depends().  This isn't
+strictly needed if you can be sure that the opposition index will _only_ be
+used the once.
+
+
+===============
+FURTHER READING
+===============
+
+See also Documentation/memory-barriers.txt for a description of Linux's memory
+barrier facilities.
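As a minimal illustration of the CIRC_* macros this new document describes, here is a hedged, single-threaded userspace sketch; the 16-slot buffer and int payload are assumptions for the example, and the macro bodies are reproduced from <linux/circ_buf.h> so it compiles outside the kernel:

	#include <stdio.h>

	/* CIRC_CNT()/CIRC_SPACE() as defined in <linux/circ_buf.h> */
	#define CIRC_CNT(head, tail, size)   (((head) - (tail)) & ((size) - 1))
	#define CIRC_SPACE(head, tail, size) CIRC_CNT((tail), ((head) + 1), (size))

	#define BUF_SIZE 16	/* must be a power of 2 */

	int main(void)
	{
		unsigned long head = 0, tail = 0;
		int buffer[BUF_SIZE];
		int v;

		/* producer side: insert while space remains (at most BUF_SIZE-1 items) */
		for (v = 0; v < 20; v++) {
			if (CIRC_SPACE(head, tail, BUF_SIZE) >= 1) {
				buffer[head] = v;
				head = (head + 1) & (BUF_SIZE - 1);
			}
		}

		/* consumer side: drain whatever is occupied */
		while (CIRC_CNT(head, tail, BUF_SIZE) >= 1) {
			printf("%d\n", buffer[tail]);
			tail = (tail + 1) & (BUF_SIZE - 1);
		}
		return 0;
	}

The barriers in the document's producer/consumer listings only matter once the two sides run concurrently; this sketch exercises just the index arithmetic.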
diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt
new file mode 100644
index 000000000000..6e03917316bd
--- /dev/null
+++ b/Documentation/filesystems/ceph.txt
@@ -0,0 +1,139 @@
+Ceph Distributed File System
+============================
+
+Ceph is a distributed network file system designed to provide good
+performance, reliability, and scalability.
+
+Basic features include:
+
+ * POSIX semantics
+ * Seamless scaling from 1 to many thousands of nodes
+ * High availability and reliability.  No single points of failure.
+ * N-way replication of data across storage nodes
+ * Fast recovery from node failures
+ * Automatic rebalancing of data on node addition/removal
+ * Easy deployment: most FS components are userspace daemons
+
+Also,
+ * Flexible snapshots (on any directory)
+ * Recursive accounting (nested files, directories, bytes)
+
+In contrast to cluster filesystems like GFS, OCFS2, and GPFS that rely
+on symmetric access by all clients to shared block devices, Ceph
+separates data and metadata management into independent server
+clusters, similar to Lustre.  Unlike Lustre, however, metadata and
+storage nodes run entirely as user space daemons.  Storage nodes
+utilize btrfs to store data objects, leveraging its advanced features
+(checksumming, metadata replication, etc.).  File data is striped
+across storage nodes in large chunks to distribute workload and
+facilitate high throughputs.  When storage nodes fail, data is
+re-replicated in a distributed fashion by the storage nodes themselves
+(with some minimal coordination from a cluster monitor), making the
+system extremely efficient and scalable.
+
+Metadata servers effectively form a large, consistent, distributed
+in-memory cache above the file namespace that is extremely scalable,
+dynamically redistributes metadata in response to workload changes,
+and can tolerate arbitrary (well, non-Byzantine) node failures.  The
+metadata server takes a somewhat unconventional approach to metadata
+storage to significantly improve performance for common workloads.  In
+particular, inodes with only a single link are embedded in
+directories, allowing entire directories of dentries and inodes to be
+loaded into its cache with a single I/O operation.  The contents of
+extremely large directories can be fragmented and managed by
+independent metadata servers, allowing scalable concurrent access.
+
+The system offers automatic data rebalancing/migration when scaling
+from a small cluster of just a few nodes to many hundreds, without
+requiring an administrator to carve the data set into static volumes or
+go through the tedious process of migrating data between servers.
+When the file system approaches full, new nodes can be easily added
+and things will "just work."
+
+Ceph includes a flexible snapshot mechanism that allows a user to create
+a snapshot on any subdirectory (and its nested contents) in the
+system.  Snapshot creation and deletion are as simple as 'mkdir
+.snap/foo' and 'rmdir .snap/foo'.
+
+Ceph also provides some recursive accounting on directories for nested
+files and bytes.  That is, a 'getfattr -d foo' on any directory in the
+system will reveal the total number of nested regular files and
+subdirectories, and a summation of all nested file sizes.  This makes
+the identification of large disk space consumers relatively quick, as
+no 'du' or similar recursive scan of the file system is required.
+
+
+Mount Syntax
+============
+
+The basic mount syntax is:
+
+ # mount -t ceph monip[:port][,monip2[:port]...]:/[subdir] mnt
+
+You only need to specify a single monitor, as the client will get the
+full list when it connects.  (However, if the monitor you specify
+happens to be down, the mount won't succeed.)  The port can be left
+off if the monitor is using the default.  So if the monitor is at
+1.2.3.4,
+
+ # mount -t ceph 1.2.3.4:/ /mnt/ceph
+
+is sufficient.  If /sbin/mount.ceph is installed, a hostname can be
+used instead of an IP address.
+
+
+
+Mount Options
+=============
+
+  ip=A.B.C.D[:N]
+	Specify the IP and/or port the client should bind to locally.
+	There is normally not much reason to do this.  If the IP is not
+	specified, the client's IP address is determined by looking at the
+	address its connection to the monitor originates from.
+
+  wsize=X
+	Specify the maximum write size in bytes.  By default there is no
+	maximum.  Ceph will normally size writes based on the file stripe
+	size.
+
+  rsize=X
+	Specify the maximum readahead.
+
+  mount_timeout=X
+	Specify the timeout value for mount (in seconds), in the case
+	of a non-responsive Ceph file system.  The default is 30
+	seconds.
+
+  rbytes
+	When stat() is called on a directory, set st_size to 'rbytes',
+	the summation of file sizes over all files nested beneath that
+	directory.  This is the default.
+
+  norbytes
+	When stat() is called on a directory, set st_size to the
+	number of entries in that directory.
+
+  nocrc
+	Disable CRC32C calculation for data writes.  If set, the OSD
+	must rely on TCP's error correction to detect data corruption
+	in the data payload.
+
+  noasyncreaddir
+	Disable the client's use of its local cache to satisfy readdir
+	requests.  (This does not change correctness; the client uses
+	cached metadata only when a lease or capability ensures it is
+	valid.)
+
+
+More Information
+================
+
+For more information on Ceph, see the home page at
+	http://ceph.newdream.net/
+
+The Linux kernel client source tree is available at
+	git://ceph.newdream.net/linux-ceph-client.git
+
+and the source for the full system is at
+	git://ceph.newdream.net/ceph.git
diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
index 3015da0c6b2a..fe09a2cb1858 100644
--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@@ -82,11 +82,13 @@ tmpfs has a mount option to set the NUMA memory allocation policy for
 all files in that instance (if CONFIG_NUMA is enabled) - which can be
 adjusted on the fly via 'mount -o remount ...'

-mpol=default             prefers to allocate memory from the local node
+mpol=default             use the process allocation policy
+                         (see set_mempolicy(2))
 mpol=prefer:Node         prefers to allocate memory from the given Node
 mpol=bind:NodeList       allocates memory only from nodes in NodeList
 mpol=interleave          prefers to allocate from each node in turn
 mpol=interleave:NodeList allocates from each node of NodeList in turn
+mpol=local               prefers to allocate memory from the local node

 NodeList format is a comma-separated list of decimal numbers and ranges,
 a range being two hyphen-separated decimal numbers, the smallest and
@@ -134,3 +136,5 @@ Author:
 	Christoph Rohland <cr@sap.com>, 1.12.01
 Updated:
 	Hugh Dickins, 4 June 2007
+Updated:
+	KOSAKI Motohiro, 16 Mar 2010
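Because mpol=default now defers to the calling task's policy, a process can steer its tmpfs allocations with set_mempolicy(2).  A minimal userspace sketch, assuming libnuma's <numaif.h> wrapper and an illustrative node 0:

	#include <numaif.h>	/* set_mempolicy(); link with -lnuma */
	#include <stdio.h>

	int main(void)
	{
		unsigned long nodemask = 1UL << 0;	/* node 0, illustrative */

		/* Later allocations by this task - including tmpfs pages it
		 * faults in under mpol=default - come only from node 0. */
		if (set_mempolicy(MPOL_BIND, &nodemask, 8 * sizeof(nodemask))) {
			perror("set_mempolicy");
			return 1;
		}
		return 0;
	}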
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 35c9b51d20ea..dd5806f4fcc4 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -291,6 +291,7 @@ Code Seq#(hex) Include File Comments
 0x92	00-0F	drivers/usb/mon/mon_bin.c
 0x93	60-7F	linux/auto_fs.h
 0x94	all	fs/btrfs/ioctl.h
+0x97	00-7F	fs/ceph/ioctl.h		Ceph file system
 0x99	00-0F	537-Addinboard driver
 		<mailto:buk@buks.ipn.de>
 0xA0	all	linux/sdp/sdp.h		Industrial Device Project
diff --git a/Documentation/kobject.txt b/Documentation/kobject.txt
index bdb13817e1e9..3ab2472509cb 100644
--- a/Documentation/kobject.txt
+++ b/Documentation/kobject.txt
@@ -59,37 +59,56 @@ nice to have in other objects. The C language does not allow for the
 direct expression of inheritance, so other techniques - such as structure
 embedding - must be used.

-So, for example, the UIO code has a structure that defines the memory
-region associated with a uio device:
+(As an aside, for those familiar with the kernel linked list implementation,
+this is analogous to how "list_head" structs are rarely useful on
+their own, but are invariably found embedded in the larger objects of
+interest.)

-struct uio_mem {
+So, for example, the UIO code in drivers/uio/uio.c has a structure that
+defines the memory region associated with a uio device:
+
+    struct uio_map {
         struct kobject kobj;
-        unsigned long addr;
-        unsigned long size;
-        int memtype;
-        void __iomem *internal_addr;
-};
+        struct uio_mem *mem;
+    };

-If you have a struct uio_mem structure, finding its embedded kobject is
+If you have a struct uio_map structure, finding its embedded kobject is
 just a matter of using the kobj member.  Code that works with kobjects will
 often have the opposite problem, however: given a struct kobject pointer,
 what is the pointer to the containing structure?  You must avoid tricks
 (such as assuming that the kobject is at the beginning of the structure)
 and, instead, use the container_of() macro, found in <linux/kernel.h>:

     container_of(pointer, type, member)
+
+where:
+
+  * "pointer" is the pointer to the embedded kobject,
+  * "type" is the type of the containing structure, and
+  * "member" is the name of the structure field to which "pointer" points.
+
+The return value from container_of() is a pointer to the corresponding
+container type.  So, for example, a pointer "kp" to a struct kobject
+embedded *within* a struct uio_map could be converted to a pointer to the
+*containing* uio_map structure with:
+
+    struct uio_map *u_map = container_of(kp, struct uio_map, kobj);
+
+For convenience, programmers often define a simple macro for "back-casting"
+kobject pointers to the containing type.  Exactly this happens in the
+earlier drivers/uio/uio.c, as you can see here:
+
+    struct uio_map {
+        struct kobject kobj;
+        struct uio_mem *mem;
+    };

-where pointer is the pointer to the embedded kobject, type is the type of
-the containing structure, and member is the name of the structure field to
-which pointer points.  The return value from container_of() is a pointer to
-the given type.  So, for example, a pointer "kp" to a struct kobject
-embedded within a struct uio_mem could be converted to a pointer to the
-containing uio_mem structure with:
+#define to_map(map) container_of(map, struct uio_map, kobj)

-    struct uio_mem *u_mem = container_of(kp, struct uio_mem, kobj);
+where the macro argument "map" is a pointer to the struct kobject in
+question.  That macro is subsequently invoked with:

-Programmers often define a simple macro for "back-casting" kobject pointers
-to the containing type.
+    struct uio_map *map = to_map(kobj);


 Initialization of kobjects
@@ -387,4 +406,5 @@ called, and the objects in the former circle release each other.
 Example code to copy from

 For a more complete example of using ksets and kobjects properly, see the
-sample/kobject/kset-example.c code.
+example programs samples/kobject/{kobject-example.c,kset-example.c},
+which will be built as loadable modules if you select CONFIG_SAMPLE_KOBJECT.
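A self-contained userspace sketch of the back-casting pattern the revised text describes; the offsetof()-based container_of() mirrors the kernel's definition, and the struct shapes follow the uio_map example above:

	#include <stddef.h>
	#include <stdio.h>

	/* userspace rendition of container_of() from <linux/kernel.h> */
	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))

	struct kobject { const char *name; };

	struct uio_map {
		struct kobject kobj;	/* embedded kobject */
		int mem;		/* stand-in for struct uio_mem * */
	};

	#define to_map(map) container_of(map, struct uio_map, kobj)

	int main(void)
	{
		struct uio_map map = { .kobj = { "map0" }, .mem = 42 };
		struct kobject *kp = &map.kobj;	/* what callback code is handed */
		struct uio_map *m = to_map(kp);	/* back-cast to the container */

		printf("%s mem=%d\n", m->kobj.name, m->mem);
		return 0;
	}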
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 7f5809eddee6..631ad2f1b229 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -3,6 +3,7 @@
 ============================

 By: David Howells <dhowells@redhat.com>
+    Paul E. McKenney <paulmck@linux.vnet.ibm.com>

 Contents:

@@ -60,6 +61,10 @@ Contents:

      - And then there's the Alpha.

+ (*) Example uses.
+
+     - Circular buffers.
+
  (*) References.


@@ -2226,6 +2231,21 @@ The Alpha defines the Linux kernel's memory barrier model.
 See the subsection on "Cache Coherency" above.


+============
+EXAMPLE USES
+============
+
+CIRCULAR BUFFERS
+----------------
+
+Memory barriers can be used to implement circular buffering without the need
+of a lock to serialise the producer with the consumer.  See:
+
+	Documentation/circular-buffers.txt
+
+for details.
+
+
 ==========
 REFERENCES
 ==========
diff --git a/Documentation/volatile-considered-harmful.txt b/Documentation/volatile-considered-harmful.txt
index 991c26a6ef64..db0cb228d64a 100644
--- a/Documentation/volatile-considered-harmful.txt
+++ b/Documentation/volatile-considered-harmful.txt
@@ -63,9 +63,9 @@ way to perform a busy wait is:
 	cpu_relax();

 The cpu_relax() call can lower CPU power consumption or yield to a
-hyperthreaded twin processor; it also happens to serve as a memory barrier,
-so, once again, volatile is unnecessary.  Of course, busy-waiting is
-generally an anti-social act to begin with.
+hyperthreaded twin processor; it also happens to serve as a compiler
+barrier, so, once again, volatile is unnecessary.  Of course, busy-
+waiting is generally an anti-social act to begin with.

 There are still a few rare situations where volatile makes sense in the
 kernel:
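A sketch of the busy-wait shape this passage refers to; 'flag' and cpu_relax() are assumed from the surrounding kernel context.  The compiler barrier inside cpu_relax() forces 'flag' to be re-read on every iteration, which is why no volatile qualifier is needed:

	static int flag;

	static void wait_for_flag(void)
	{
		while (!flag)
			cpu_relax();	/* compiler barrier + CPU relax hint */
	}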
diff --git a/MAINTAINERS b/MAINTAINERS
index 382eaa4d0068..fbc3d653d52b 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -797,12 +797,12 @@ M: Michael Petchkovsky <mkpetch@internode.on.net>
 S:	Maintained

 ARM/NOMADIK ARCHITECTURE
 M:	Alessandro Rubini <rubini@unipv.it>
 M:	STEricsson <STEricsson_nomadik_linux@list.st.com>
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:	Maintained
 F:	arch/arm/mach-nomadik/
 F:	arch/arm/plat-nomadik/

807ARM/OPENMOKO NEO FREERUNNER (GTA02) MACHINE SUPPORT 807ARM/OPENMOKO NEO FREERUNNER (GTA02) MACHINE SUPPORT
808M: Nelson Castillo <arhuaco@freaks-unidos.net> 808M: Nelson Castillo <arhuaco@freaks-unidos.net>
@@ -1441,6 +1441,15 @@ F: arch/powerpc/include/asm/spu*.h
 F:	arch/powerpc/oprofile/*cell*
 F:	arch/powerpc/platforms/cell/

+CEPH DISTRIBUTED FILE SYSTEM CLIENT
+M:	Sage Weil <sage@newdream.net>
+L:	ceph-devel@lists.sourceforge.net
+W:	http://ceph.newdream.net/
+T:	git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
+S:	Supported
+F:	Documentation/filesystems/ceph.txt
+F:	fs/ceph
+
 CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM:
 M:	David Vrabel <david.vrabel@csr.com>
 L:	linux-usb@vger.kernel.org
@@ -1917,17 +1926,17 @@ F: drivers/scsi/dpt*
 F:	drivers/scsi/dpt/

 DRBD DRIVER
 P:	Philipp Reisner
 P:	Lars Ellenberg
 M:	drbd-dev@lists.linbit.com
 L:	drbd-user@lists.linbit.com
 W:	http://www.drbd.org
 T:	git git://git.drbd.org/linux-2.6-drbd.git drbd
 T:	git git://git.drbd.org/drbd-8.3.git
 S:	Supported
 F:	drivers/block/drbd/
 F:	lib/lru_cache.c
 F:	Documentation/blockdev/drbd/

 DRIVER CORE, KOBJECTS, AND SYSFS
 M:	Greg Kroah-Hartman <gregkh@suse.de>
@@ -3509,8 +3518,8 @@ F: drivers/scsi/sym53c8xx_2/
 LTP (Linux Test Project)
 M:	Rishikesh K Rajak <risrajak@linux.vnet.ibm.com>
 M:	Garrett Cooper <yanegomi@gmail.com>
 M:	Mike Frysinger <vapier@gentoo.org>
 M:	Subrata Modak <subrata@linux.vnet.ibm.com>
 L:	ltp-list@lists.sourceforge.net (subscribers-only)
 W:	http://ltp.sourceforge.net/
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/galak/ltp.git
@@ -6192,7 +6201,7 @@ F: arch/x86/
 X86 PLATFORM DRIVERS
 M:	Matthew Garrett <mjg@redhat.com>
 L:	platform-driver-x86@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/mjg59/platform-drivers-x86.git
 S:	Maintained
 F:	drivers/platform/x86

diff --git a/Makefile b/Makefile
index 08ff02da7ce3..a5ba759e0fd5 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 34
-EXTRAVERSION = -rc1
+EXTRAVERSION = -rc2
 NAME = Man-Eating Seals of Antiquity

 # *DOCUMENTATION*
diff --git a/arch/alpha/include/asm/core_marvel.h b/arch/alpha/include/asm/core_marvel.h
index 30d55fe7aaf6..dad300fa14ce 100644
--- a/arch/alpha/include/asm/core_marvel.h
+++ b/arch/alpha/include/asm/core_marvel.h
@@ -12,7 +12,6 @@
 #define __ALPHA_MARVEL__H__

 #include <linux/types.h>
-#include <linux/pci.h>
 #include <linux/spinlock.h>

 #include <asm/compiler.h>
diff --git a/arch/alpha/include/asm/core_mcpcia.h b/arch/alpha/include/asm/core_mcpcia.h
index acf55b483472..21ac53383b37 100644
--- a/arch/alpha/include/asm/core_mcpcia.h
+++ b/arch/alpha/include/asm/core_mcpcia.h
@@ -6,7 +6,6 @@
 #define MCPCIA_ONE_HAE_WINDOW 1

 #include <linux/types.h>
-#include <linux/pci.h>
 #include <asm/compiler.h>

 /*
diff --git a/arch/alpha/include/asm/core_titan.h b/arch/alpha/include/asm/core_titan.h
index a17f6f33b68e..8cf79d1219e1 100644
--- a/arch/alpha/include/asm/core_titan.h
+++ b/arch/alpha/include/asm/core_titan.h
@@ -2,7 +2,6 @@
 #define __ALPHA_TITAN__H__

 #include <linux/types.h>
-#include <linux/pci.h>
 #include <asm/compiler.h>

 /*
diff --git a/arch/alpha/include/asm/core_tsunami.h b/arch/alpha/include/asm/core_tsunami.h
index 58d4fe48742c..8e39ecf09419 100644
--- a/arch/alpha/include/asm/core_tsunami.h
+++ b/arch/alpha/include/asm/core_tsunami.h
@@ -2,7 +2,6 @@
 #define __ALPHA_TSUNAMI__H__

 #include <linux/types.h>
-#include <linux/pci.h>
 #include <asm/compiler.h>

 /*
diff --git a/arch/alpha/kernel/sys_dp264.c b/arch/alpha/kernel/sys_dp264.c
index d64e1e497e76..4026502ab707 100644
--- a/arch/alpha/kernel/sys_dp264.c
+++ b/arch/alpha/kernel/sys_dp264.c
@@ -224,7 +224,7 @@ static void
 dp264_device_interrupt(unsigned long vector)
 {
 #if 1
-	printk("dp264_device_interrupt: NOT IMPLEMENTED YET!! \n");
+	printk("dp264_device_interrupt: NOT IMPLEMENTED YET!!\n");
 #else
 	unsigned long pld;
 	unsigned int i;
diff --git a/arch/alpha/kernel/sys_titan.c b/arch/alpha/kernel/sys_titan.c
index 288053342c83..9008d0f20c53 100644
--- a/arch/alpha/kernel/sys_titan.c
+++ b/arch/alpha/kernel/sys_titan.c
@@ -171,7 +171,7 @@ titan_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
 static void
 titan_device_interrupt(unsigned long vector)
 {
-	printk("titan_device_interrupt: NOT IMPLEMENTED YET!! \n");
+	printk("titan_device_interrupt: NOT IMPLEMENTED YET!!\n");
 }

 static void
diff --git a/arch/alpha/kernel/traps.c b/arch/alpha/kernel/traps.c
index 6ee7655b7568..b14f015008ad 100644
--- a/arch/alpha/kernel/traps.c
+++ b/arch/alpha/kernel/traps.c
@@ -17,6 +17,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/kallsyms.h>
+#include <linux/ratelimit.h>

 #include <asm/gentrap.h>
 #include <asm/uaccess.h>
@@ -771,8 +772,7 @@ asmlinkage void
 do_entUnaUser(void __user * va, unsigned long opcode,
 	      unsigned long reg, struct pt_regs *regs)
 {
-	static int cnt = 0;
-	static unsigned long last_time;
+	static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 5);

 	unsigned long tmp1, tmp2, tmp3, tmp4;
 	unsigned long fake_reg, *reg_addr = &fake_reg;
@@ -783,15 +783,11 @@ do_entUnaUser(void __user * va, unsigned long opcode,
 	   with the unaliged access.  */

 	if (!test_thread_flag (TIF_UAC_NOPRINT)) {
-		if (cnt >= 5 && time_after(jiffies, last_time + 5 * HZ)) {
-			cnt = 0;
-		}
-		if (++cnt < 5) {
+		if (__ratelimit(&ratelimit)) {
 			printk("%s(%d): unaligned trap at %016lx: %p %lx %ld\n",
 			       current->comm, task_pid_nr(current),
 			       regs->pc - 4, va, opcode, reg);
 		}
-		last_time = jiffies;
 	}
 	if (test_thread_flag (TIF_UAC_SIGBUS))
 		goto give_sigbus;
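The traps.c change above swaps an open-coded throttle for the generic ratelimit helper; a minimal sketch of that API in kernel style, with an illustrative message:

	#include <linux/kernel.h>
	#include <linux/ratelimit.h>

	static void report_event(void)
	{
		/* allow at most 5 messages per 5*HZ window, as in the patch */
		static DEFINE_RATELIMIT_STATE(rs, 5 * HZ, 5);

		if (__ratelimit(&rs))
			printk(KERN_INFO "example: event happened\n");
	}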
diff --git a/arch/arm/common/locomo.c b/arch/arm/common/locomo.c
index 90ae00b631c2..9dff07c80ddb 100644
--- a/arch/arm/common/locomo.c
+++ b/arch/arm/common/locomo.c
@@ -290,7 +290,7 @@ static int locomo_suspend(struct platform_device *dev, pm_message_t state)
 	save->LCM_GPO     = locomo_readl(lchip->base + LOCOMO_GPO);	/* GPIO */
 	locomo_writel(0x00, lchip->base + LOCOMO_GPO);
 	save->LCM_SPICT   = locomo_readl(lchip->base + LOCOMO_SPI + LOCOMO_SPICT);	/* SPI */
-	locomo_writel(0x40, lchip->base + LOCOMO_SPICT);
+	locomo_writel(0x40, lchip->base + LOCOMO_SPI + LOCOMO_SPICT);
 	save->LCM_GPE     = locomo_readl(lchip->base + LOCOMO_GPE);	/* GPIO */
 	locomo_writel(0x00, lchip->base + LOCOMO_GPE);
 	save->LCM_ASD     = locomo_readl(lchip->base + LOCOMO_ASD);	/* ADSTART */
@@ -418,7 +418,7 @@ __locomo_probe(struct device *me, struct resource *mem, int irq)
 	/* Longtime timer */
 	locomo_writel(0, lchip->base + LOCOMO_LTINT);
 	/* SPI */
-	locomo_writel(0, lchip->base + LOCOMO_SPIIE);
+	locomo_writel(0, lchip->base + LOCOMO_SPI + LOCOMO_SPIIE);

 	locomo_writel(6 + 8 + 320 + 30 - 10, lchip->base + LOCOMO_ASD);
 	r = locomo_readl(lchip->base + LOCOMO_ASD);
@@ -707,7 +707,7 @@ void locomo_m62332_senddata(struct locomo_dev *ldev, unsigned int dac_data, int
 	udelay(DAC_SCL_HIGH_HOLD_TIME);	/* 4.7 usec */
 	if (locomo_readl(mapbase + LOCOMO_DAC) & LOCOMO_DAC_SDAOEB) {	/* High is error */
 		printk(KERN_WARNING "locomo: m62332_senddata Error 1\n");
-		return;
+		goto out;
 	}

 	/* Send Sub address (LSB is channel select) */
@@ -735,7 +735,7 @@ void locomo_m62332_senddata(struct locomo_dev *ldev, unsigned int dac_data, int
 	udelay(DAC_SCL_HIGH_HOLD_TIME);	/* 4.7 usec */
 	if (locomo_readl(mapbase + LOCOMO_DAC) & LOCOMO_DAC_SDAOEB) {	/* High is error */
 		printk(KERN_WARNING "locomo: m62332_senddata Error 2\n");
-		return;
+		goto out;
 	}

 	/* Send DAC data */
@@ -760,9 +760,9 @@ void locomo_m62332_senddata(struct locomo_dev *ldev, unsigned int dac_data, int
 	udelay(DAC_SCL_HIGH_HOLD_TIME);	/* 4.7 usec */
 	if (locomo_readl(mapbase + LOCOMO_DAC) & LOCOMO_DAC_SDAOEB) {	/* High is error */
 		printk(KERN_WARNING "locomo: m62332_senddata Error 3\n");
-		return;
 	}

+out:
 	/* stop */
 	r = locomo_readl(mapbase + LOCOMO_DAC);
 	r &= ~(LOCOMO_DAC_SCLOEB);
diff --git a/arch/arm/mach-ixp23xx/include/mach/memory.h b/arch/arm/mach-ixp23xx/include/mach/memory.h
index 94a3a86cfeb8..6ef65d813f16 100644
--- a/arch/arm/mach-ixp23xx/include/mach/memory.h
+++ b/arch/arm/mach-ixp23xx/include/mach/memory.h
@@ -19,7 +19,7 @@
  */
 #define PHYS_OFFSET		(0x00000000)

-#define IXP23XX_PCI_SDRAM_OFFSET (*((volatile int *)IXP23XX_PCI_SDRAM_BAR) & 0xfffffff0))
+#define IXP23XX_PCI_SDRAM_OFFSET (*((volatile int *)IXP23XX_PCI_SDRAM_BAR) & 0xfffffff0)

 #define __phys_to_bus(x)	((x) + (IXP23XX_PCI_SDRAM_OFFSET - PHYS_OFFSET))
 #define __bus_to_phys(x)	((x) - (IXP23XX_PCI_SDRAM_OFFSET - PHYS_OFFSET))
diff --git a/arch/arm/mach-kirkwood/mv88f6281gtw_ge-setup.c b/arch/arm/mach-kirkwood/mv88f6281gtw_ge-setup.c
index 0358f45766cb..5e6f711b1c67 100644
--- a/arch/arm/mach-kirkwood/mv88f6281gtw_ge-setup.c
+++ b/arch/arm/mach-kirkwood/mv88f6281gtw_ge-setup.c
@@ -74,9 +74,9 @@ static struct gpio_keys_button mv88f6281gtw_ge_button_pins[] = {
 		.desc		= "SWR Button",
 		.active_low	= 1,
 	}, {
-		.code		= KEY_F1,
+		.code		= KEY_WPS_BUTTON,
 		.gpio		= 46,
-		.desc		= "WPS Button(F1)",
+		.desc		= "WPS Button",
 		.active_low	= 1,
 	},
 };
diff --git a/arch/arm/mach-mmp/include/mach/uncompress.h b/arch/arm/mach-mmp/include/mach/uncompress.h
index a7dcc5307216..85bd8a2d84b5 100644
--- a/arch/arm/mach-mmp/include/mach/uncompress.h
+++ b/arch/arm/mach-mmp/include/mach/uncompress.h
@@ -14,7 +14,7 @@
 #define UART2_BASE	(APB_PHYS_BASE + 0x17000)
 #define UART3_BASE	(APB_PHYS_BASE + 0x18000)

-static volatile unsigned long *UART = (unsigned long *)UART2_BASE;
+static volatile unsigned long *UART;

 static inline void putc(char c)
 {
@@ -37,6 +37,9 @@ static inline void flush(void)

 static inline void arch_decomp_setup(void)
 {
+	/* default to UART2 */
+	UART = (unsigned long *)UART2_BASE;
+
 	if (machine_is_avengers_lite())
 		UART = (unsigned long *)UART3_BASE;
 }
diff --git a/arch/arm/mach-orion5x/wrt350n-v2-setup.c b/arch/arm/mach-orion5x/wrt350n-v2-setup.c
index cb0feca193d4..f9f222ebb7ed 100644
--- a/arch/arm/mach-orion5x/wrt350n-v2-setup.c
+++ b/arch/arm/mach-orion5x/wrt350n-v2-setup.c
@@ -77,7 +77,7 @@ static struct gpio_keys_button wrt350n_v2_buttons[] = {
 		.desc		= "Reset Button",
 		.active_low	= 1,
 	}, {
-		.code		= KEY_WLAN,
+		.code		= KEY_WPS_BUTTON,
 		.gpio		= 2,
 		.desc		= "WPS Button",
 		.active_low	= 1,
diff --git a/arch/arm/mach-pxa/Kconfig b/arch/arm/mach-pxa/Kconfig
index 38fbd0a0e402..5b6ee46fa7f6 100644
--- a/arch/arm/mach-pxa/Kconfig
+++ b/arch/arm/mach-pxa/Kconfig
@@ -272,7 +272,6 @@ config MACH_H5000
 config MACH_HIMALAYA
 	bool "HTC Himalaya Support"
 	select CPU_PXA26x
-	select FB_W100

 config MACH_MAGICIAN
 	bool "Enable HTC Magician Support"
@@ -454,6 +453,13 @@ config PXA_SHARPSL
 config SHARPSL_PM
 	bool
 	select APM_EMULATION
+	select SHARPSL_PM_MAX1111
+
+config SHARPSL_PM_MAX1111
+	bool
+	depends on !CORGI_SSP_DEPRECATED
+	select HWMON
+	select SENSORS_MAX1111

 config CORGI_SSP_DEPRECATED
 	bool
@@ -547,7 +553,6 @@ config MACH_E740
 	bool "Toshiba e740"
 	default y
 	depends on ARCH_PXA_ESERIES
-	select FB_W100
 	help
 	  Say Y here if you intend to run this kernel on a Toshiba
 	  e740 family PDA.
@@ -556,7 +561,6 @@ config MACH_E750
556 bool "Toshiba e750" 561 bool "Toshiba e750"
557 default y 562 default y
558 depends on ARCH_PXA_ESERIES 563 depends on ARCH_PXA_ESERIES
559 select FB_W100
560 help 564 help
561 Say Y here if you intend to run this kernel on a Toshiba 565 Say Y here if you intend to run this kernel on a Toshiba
562 e750 family PDA. 566 e750 family PDA.
@@ -573,7 +577,6 @@ config MACH_E800
573 bool "Toshiba e800" 577 bool "Toshiba e800"
574 default y 578 default y
575 depends on ARCH_PXA_ESERIES 579 depends on ARCH_PXA_ESERIES
576 select FB_W100
577 help 580 help
578 Say Y here if you intend to run this kernel on a Toshiba 581 Say Y here if you intend to run this kernel on a Toshiba
579 e800 family PDA. 582 e800 family PDA.
diff --git a/arch/arm/mach-pxa/imote2.c b/arch/arm/mach-pxa/imote2.c
index b2f878bd460b..5161dca8ccc0 100644
--- a/arch/arm/mach-pxa/imote2.c
+++ b/arch/arm/mach-pxa/imote2.c
@@ -559,10 +559,6 @@ static void __init imote2_init(void)
559 pxa_set_btuart_info(NULL); 559 pxa_set_btuart_info(NULL);
560 pxa_set_stuart_info(NULL); 560 pxa_set_stuart_info(NULL);
561 561
562 /* SPI chip select directions - all other directions should
563 * be handled by drivers.*/
564 gpio_direction_output(37, 0);
565
566 platform_add_devices(imote2_devices, ARRAY_SIZE(imote2_devices)); 562 platform_add_devices(imote2_devices, ARRAY_SIZE(imote2_devices));
567 563
568 pxa2xx_set_spi_info(1, &pxa_ssp_master_0_info); 564 pxa2xx_set_spi_info(1, &pxa_ssp_master_0_info);
diff --git a/arch/arm/mach-pxa/include/mach/uncompress.h b/arch/arm/mach-pxa/include/mach/uncompress.h
index 5ef91d9d17e4..759b851ec985 100644
--- a/arch/arm/mach-pxa/include/mach/uncompress.h
+++ b/arch/arm/mach-pxa/include/mach/uncompress.h
@@ -16,9 +16,9 @@
16#define BTUART_BASE (0x40200000) 16#define BTUART_BASE (0x40200000)
17#define STUART_BASE (0x40700000) 17#define STUART_BASE (0x40700000)
18 18
19static unsigned long uart_base = FFUART_BASE; 19static unsigned long uart_base;
20static unsigned int uart_shift = 2; 20static unsigned int uart_shift;
21static unsigned int uart_is_pxa = 1; 21static unsigned int uart_is_pxa;
22 22
23static inline unsigned char uart_read(int offset) 23static inline unsigned char uart_read(int offset)
24{ 24{
@@ -56,6 +56,11 @@ static inline void flush(void)
56 56
57static inline void arch_decomp_setup(void) 57static inline void arch_decomp_setup(void)
58{ 58{
59 /* initialize to default */
60 uart_base = FFUART_BASE;
61 uart_shift = 2;
62 uart_is_pxa = 1;
63
59 if (machine_is_littleton() || machine_is_intelmote2() 64 if (machine_is_littleton() || machine_is_intelmote2()
60 || machine_is_csb726() || machine_is_stargate2() 65 || machine_is_csb726() || machine_is_stargate2()
61 || machine_is_cm_x300() || machine_is_balloon3()) 66 || machine_is_cm_x300() || machine_is_balloon3())
diff --git a/arch/arm/mach-pxa/raumfeld.c b/arch/arm/mach-pxa/raumfeld.c
index 3184bdc14526..44bb675e47f1 100644
--- a/arch/arm/mach-pxa/raumfeld.c
+++ b/arch/arm/mach-pxa/raumfeld.c
@@ -37,8 +37,6 @@
37#include <linux/lis3lv02d.h> 37#include <linux/lis3lv02d.h>
38#include <linux/pda_power.h> 38#include <linux/pda_power.h>
39#include <linux/power_supply.h> 39#include <linux/power_supply.h>
40#include <linux/pda_power.h>
41#include <linux/power_supply.h>
42#include <linux/regulator/max8660.h> 40#include <linux/regulator/max8660.h>
43#include <linux/regulator/machine.h> 41#include <linux/regulator/machine.h>
44#include <linux/regulator/fixed.h> 42#include <linux/regulator/fixed.h>
@@ -444,7 +442,7 @@ static struct gpio_keys_button gpio_keys_button[] = {
444 .active_low = 0, 442 .active_low = 0,
445 .wakeup = 0, 443 .wakeup = 0,
446 .debounce_interval = 5, /* ms */ 444 .debounce_interval = 5, /* ms */
447 .desc = "on/off button", 445 .desc = "on_off button",
448 }, 446 },
449}; 447};
450 448
diff --git a/arch/arm/mach-pxa/stargate2.c b/arch/arm/mach-pxa/stargate2.c
index a98a434f0111..2041eb1d90ba 100644
--- a/arch/arm/mach-pxa/stargate2.c
+++ b/arch/arm/mach-pxa/stargate2.c
@@ -764,11 +764,6 @@ static void __init stargate2_init(void)
764 pxa_set_btuart_info(NULL); 764 pxa_set_btuart_info(NULL);
765 pxa_set_stuart_info(NULL); 765 pxa_set_stuart_info(NULL);
766 766
767 /* spi chip selects */
768 gpio_direction_output(37, 0);
769 gpio_direction_output(24, 0);
770 gpio_direction_output(39, 0);
771
772 platform_add_devices(ARRAY_AND_SIZE(stargate2_devices)); 767 platform_add_devices(ARRAY_AND_SIZE(stargate2_devices));
773 768
774 pxa2xx_set_spi_info(1, &pxa_ssp_master_0_info); 769 pxa2xx_set_spi_info(1, &pxa_ssp_master_0_info);
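
Both the imote2 hunk above and this one drop board code that forced the SPI chip-select GPIOs low at init; the chip select belongs to the SPI master driver, which asserts it around each transfer based on per-device chip data. A hedged sketch of the driver-side arrangement, with made-up GPIO, bus and device numbers:

#include <linux/spi/spi.h>
#include <linux/spi/pxa2xx_spi.h>

/* Hypothetical slave: let pxa2xx-spi drive GPIO 37 as the chip select. */
static struct pxa2xx_spi_chip example_chip_info = {
	.gpio_cs = 37,
};

static struct spi_board_info example_spi_devices[] = {
	{
		.modalias	 = "example-dev",
		.max_speed_hz	 = 1000000,
		.bus_num	 = 1,
		.chip_select	 = 0,
		.controller_data = &example_chip_info,
	},
};
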
diff --git a/arch/arm/tools/mach-types b/arch/arm/tools/mach-types
index 31c2f4c30a95..1536f1784cac 100644
--- a/arch/arm/tools/mach-types
+++ b/arch/arm/tools/mach-types
@@ -12,7 +12,7 @@
12# 12#
13# http://www.arm.linux.org.uk/developer/machines/?action=new 13# http://www.arm.linux.org.uk/developer/machines/?action=new
14# 14#
15# Last update: Sat Feb 20 14:16:15 2010 15# Last update: Sat Mar 20 15:35:41 2010
16# 16#
17# machine_is_xxx CONFIG_xxxx MACH_TYPE_xxx number 17# machine_is_xxx CONFIG_xxxx MACH_TYPE_xxx number
18# 18#
@@ -2663,7 +2663,7 @@ reb01 MACH_REB01 REB01 2675
2663aquila MACH_AQUILA AQUILA 2676 2663aquila MACH_AQUILA AQUILA 2676
2664spark_sls_hw2 MACH_SPARK_SLS_HW2 SPARK_SLS_HW2 2677 2664spark_sls_hw2 MACH_SPARK_SLS_HW2 SPARK_SLS_HW2 2677
2665sheeva_esata MACH_ESATA_SHEEVAPLUG ESATA_SHEEVAPLUG 2678 2665sheeva_esata MACH_ESATA_SHEEVAPLUG ESATA_SHEEVAPLUG 2678
2666surf7x30 MACH_SURF7X30 SURF7X30 2679 2666msm7x30_surf MACH_MSM7X30_SURF MSM7X30_SURF 2679
2667micro2440 MACH_MICRO2440 MICRO2440 2680 2667micro2440 MACH_MICRO2440 MICRO2440 2680
2668am2440 MACH_AM2440 AM2440 2681 2668am2440 MACH_AM2440 AM2440 2681
2669tq2440 MACH_TQ2440 TQ2440 2682 2669tq2440 MACH_TQ2440 TQ2440 2682
@@ -2678,3 +2678,74 @@ vc088x MACH_VC088X VC088X 2690
2678mioa702 MACH_MIOA702 MIOA702 2691 2678mioa702 MACH_MIOA702 MIOA702 2691
2679hpmin MACH_HPMIN HPMIN 2692 2679hpmin MACH_HPMIN HPMIN 2692
2680ak880xak MACH_AK880XAK AK880XAK 2693 2680ak880xak MACH_AK880XAK AK880XAK 2693
2681arm926tomap850 MACH_ARM926TOMAP850 ARM926TOMAP850 2694
2682lkevm MACH_LKEVM LKEVM 2695
2683mw6410 MACH_MW6410 MW6410 2696
2684terastation_wxl MACH_TERASTATION_WXL TERASTATION_WXL 2697
2685cpu8000e MACH_CPU8000E CPU8000E 2698
2686catania MACH_CATANIA CATANIA 2699
2687tokyo MACH_TOKYO TOKYO 2700
2688msm7201a_surf MACH_MSM7201A_SURF MSM7201A_SURF 2701
2689msm7201a_ffa MACH_MSM7201A_FFA MSM7201A_FFA 2702
2690msm7x25_surf MACH_MSM7X25_SURF MSM7X25_SURF 2703
2691msm7x25_ffa MACH_MSM7X25_FFA MSM7X25_FFA 2704
2692msm7x27_surf MACH_MSM7X27_SURF MSM7X27_SURF 2705
2693msm7x27_ffa MACH_MSM7X27_FFA MSM7X27_FFA 2706
2694msm7x30_ffa MACH_MSM7X30_FFA MSM7X30_FFA 2707
2695qsd8x50_surf MACH_QSD8X50_SURF QSD8X50_SURF 2708
2696qsd8x50_comet MACH_QSD8X50_COMET QSD8X50_COMET 2709
2697qsd8x50_ffa MACH_QSD8X50_FFA QSD8X50_FFA 2710
2698qsd8x50a_surf MACH_QSD8X50A_SURF QSD8X50A_SURF 2711
2699qsd8x50a_ffa MACH_QSD8X50A_FFA QSD8X50A_FFA 2712
2700adx_xgcp10 MACH_ADX_XGCP10 ADX_XGCP10 2713
2701mcgwumts2a MACH_MCGWUMTS2A MCGWUMTS2A 2714
2702mobikt MACH_MOBIKT MOBIKT 2715
2703mx53_evk MACH_MX53_EVK MX53_EVK 2716
2704igep0030 MACH_IGEP0030 IGEP0030 2717
2705axell_h40_h50_ctrl MACH_AXELL_H40_H50_CTRL AXELL_H40_H50_CTRL 2718
2706dtcommod MACH_DTCOMMOD DTCOMMOD 2719
2707gould MACH_GOULD GOULD 2720
2708siberia MACH_SIBERIA SIBERIA 2721
2709sbc3530 MACH_SBC3530 SBC3530 2722
2710qarm MACH_QARM QARM 2723
2711mips MACH_MIPS MIPS 2724
2712mx27grb MACH_MX27GRB MX27GRB 2725
2713sbc8100 MACH_SBC8100 SBC8100 2726
2714saarb MACH_SAARB SAARB 2727
2715omap3mini MACH_OMAP3MINI OMAP3MINI 2728
2716cnmbook7se MACH_CNMBOOK7SE CNMBOOK7SE 2729
2717catan MACH_CATAN CATAN 2730
2718harmony MACH_HARMONY HARMONY 2731
2719tonga MACH_TONGA TONGA 2732
2720cybook_orizon MACH_CYBOOK_ORIZON CYBOOK_ORIZON 2733
2721htcrhodiumcdma MACH_HTCRHODIUMCDMA HTCRHODIUMCDMA 2734
2722epc_g45 MACH_EPC_G45 EPC_G45 2735
2723epc_lpc3250 MACH_EPC_LPC3250 EPC_LPC3250 2736
2724mxc91341evb MACH_MXC91341EVB MXC91341EVB 2737
2725rtw1000 MACH_RTW1000 RTW1000 2738
2726bobcat MACH_BOBCAT BOBCAT 2739
2727trizeps6 MACH_TRIZEPS6 TRIZEPS6 2740
2728msm7x30_fluid MACH_MSM7X30_FLUID MSM7X30_FLUID 2741
2729nedap9263 MACH_NEDAP9263 NEDAP9263 2742
2730netgear_ms2110 MACH_NETGEAR_MS2110 NETGEAR_MS2110 2743
2731bmx MACH_BMX BMX 2744
2732netstream MACH_NETSTREAM NETSTREAM 2745
2733vpnext_rcu MACH_VPNEXT_RCU VPNEXT_RCU 2746
2734vpnext_mpu MACH_VPNEXT_MPU VPNEXT_MPU 2747
2735bcmring_tablet_v1 MACH_BCMRING_TABLET_V1 BCMRING_TABLET_V1 2748
2736sgarm10 MACH_SGARM10 SGARM10 2749
2737cm_t3517 MACH_CM_T3517 CM_T3517 2750
2738omap3_cps MACH_OMAP3_CPS OMAP3_CPS 2751
2739axar1500_receiver MACH_AXAR1500_RECEIVER AXAR1500_RECEIVER 2752
2740wbd222 MACH_WBD222 WBD222 2753
2741mt65xx MACH_MT65XX MT65XX 2754
2742msm8x60_surf MACH_MSM8X60_SURF MSM8X60_SURF 2755
2743msm8x60_sim MACH_MSM8X60_SIM MSM8X60_SIM 2756
2744vmc300 MACH_VMC300 VMC300 2757
2745tcc8000_sdk MACH_TCC8000_SDK TCC8000_SDK 2758
2746nanos MACH_NANOS NANOS 2759
2747stamp9g10 MACH_STAMP9G10 STAMP9G10 2760
2748stamp9g45 MACH_STAMP9G45 STAMP9G45 2761
2749h6053 MACH_H6053 H6053 2762
2750smint01 MACH_SMINT01 SMINT01 2763
2751prtlvt2 MACH_PRTLVT2 PRTLVT2 2764
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 8a54eb8e3768..2e19500921f9 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -313,19 +313,6 @@ config 8XX_MINIMAL_FPEMU
313 313
314 It is recommended that you build a soft-float userspace instead. 314 It is recommended that you build a soft-float userspace instead.
315 315
316config IOMMU_VMERGE
317 bool "Enable IOMMU virtual merging"
318 depends on PPC64
319 default y
320 help
321 Cause IO segments sent to a device for DMA to be merged virtually
322 by the IOMMU when they happen to have been allocated contiguously.
323 This doesn't add pressure to the IOMMU allocator. However, some
324 drivers don't support getting large merged segments coming back
325 from *_map_sg().
326
327 Most drivers don't have this problem; it is safe to say Y here.
328
329config IOMMU_HELPER 316config IOMMU_HELPER
330 def_bool PPC64 317 def_bool PPC64
331 318
diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index aea714797590..d553bbeb726c 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -25,7 +25,7 @@
25#define PPC_INST_LDARX 0x7c0000a8 25#define PPC_INST_LDARX 0x7c0000a8
26#define PPC_INST_LSWI 0x7c0004aa 26#define PPC_INST_LSWI 0x7c0004aa
27#define PPC_INST_LSWX 0x7c00042a 27#define PPC_INST_LSWX 0x7c00042a
28#define PPC_INST_LWARX 0x7c000029 28#define PPC_INST_LWARX 0x7c000028
29#define PPC_INST_LWSYNC 0x7c2004ac 29#define PPC_INST_LWSYNC 0x7c2004ac
30#define PPC_INST_LXVD2X 0x7c000698 30#define PPC_INST_LXVD2X 0x7c000698
31#define PPC_INST_MCRXR 0x7c000400 31#define PPC_INST_MCRXR 0x7c000400
@@ -62,8 +62,8 @@
62#define __PPC_T_TLB(t) (((t) & 0x3) << 21) 62#define __PPC_T_TLB(t) (((t) & 0x3) << 21)
63#define __PPC_WC(w) (((w) & 0x3) << 21) 63#define __PPC_WC(w) (((w) & 0x3) << 21)
64/* 64/*
65 * Only use the larx hint bit on 64bit CPUs. Once we verify it doesn't have 65 * Only use the larx hint bit on 64bit CPUs. e500v1/v2 based CPUs will treat a
66 * any side effects on all 32bit processors, we can do this all the time. 66 * larx with EH set as an illegal instruction.
67 */ 67 */
68#ifdef CONFIG_PPC64 68#ifdef CONFIG_PPC64
69#define __PPC_EH(eh) (((eh) & 0x1) << 0) 69#define __PPC_EH(eh) (((eh) & 0x1) << 0)
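
Two separate fixes here: PPC_INST_LWARX had the wrong instruction image (lwarx encodes as 0x7c000028; the low bit is the EH hint, not part of the opcode), and the comment now records the concrete reason the hint is 64-bit only — e500v1/v2 cores treat a larx with EH set as an illegal instruction. How the word is composed from these fields, as an illustrative helper:

#include <stdint.h>

#define EX_PPC_INST_LWARX	0x7c000028u

/* lwarx RT,RA,RB[,EH]: base image plus register fields, EH in bit 0. */
static uint32_t encode_lwarx(unsigned rt, unsigned ra, unsigned rb, unsigned eh)
{
	return EX_PPC_INST_LWARX | (rt << 21) | (ra << 16) | (rb << 11) | (eh & 1);
}

/* encode_lwarx(3, 0, 4, 1) == 0x7c602029, i.e. lwarx r3,0,r4,1 */
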
diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h
index efa7f0b879f3..23913e902fc3 100644
--- a/arch/powerpc/include/asm/syscall.h
+++ b/arch/powerpc/include/asm/syscall.h
@@ -30,7 +30,7 @@ static inline void syscall_rollback(struct task_struct *task,
30static inline long syscall_get_error(struct task_struct *task, 30static inline long syscall_get_error(struct task_struct *task,
31 struct pt_regs *regs) 31 struct pt_regs *regs)
32{ 32{
33 return (regs->ccr & 0x1000) ? -regs->gpr[3] : 0; 33 return (regs->ccr & 0x10000000) ? -regs->gpr[3] : 0;
34} 34}
35 35
36static inline long syscall_get_return_value(struct task_struct *task, 36static inline long syscall_get_return_value(struct task_struct *task,
@@ -44,10 +44,10 @@ static inline void syscall_set_return_value(struct task_struct *task,
44 int error, long val) 44 int error, long val)
45{ 45{
46 if (error) { 46 if (error) {
47 regs->ccr |= 0x1000L; 47 regs->ccr |= 0x10000000L;
48 regs->gpr[3] = -error; 48 regs->gpr[3] = -error;
49 } else { 49 } else {
50 regs->ccr &= ~0x1000L; 50 regs->ccr &= ~0x10000000L;
51 regs->gpr[3] = val; 51 regs->gpr[3] = val;
52 } 52 }
53} 53}
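
The old 0x1000 mask tested the wrong spot: in the 32-bit CR image CR0 occupies the top nibble, so its summary-overflow (SO) bit — the flag the kernel raises on syscall failure — is 0x10000000. A small sketch of the same convention from a tracer's point of view:

#define EX_CR0_SO	0x10000000UL	/* CR0.SO: set when the syscall failed */

/* Hypothetical helper: fold saved ccr/r3 into a Linux-style return value. */
static long example_syscall_result(unsigned long ccr, unsigned long r3)
{
	return (ccr & EX_CR0_SO) ? -(long)r3 : (long)r3;
}
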
diff --git a/arch/powerpc/kernel/head_fsl_booke.S b/arch/powerpc/kernel/head_fsl_booke.S
index 25793bb0e782..725526547994 100644
--- a/arch/powerpc/kernel/head_fsl_booke.S
+++ b/arch/powerpc/kernel/head_fsl_booke.S
@@ -747,9 +747,6 @@ finish_tlb_load:
747#else 747#else
748 rlwimi r12, r11, 26, 27, 31 /* extract WIMGE from pte */ 748 rlwimi r12, r11, 26, 27, 31 /* extract WIMGE from pte */
749#endif 749#endif
750#ifdef CONFIG_SMP
751 ori r12, r12, MAS2_M
752#endif
753 mtspr SPRN_MAS2, r12 750 mtspr SPRN_MAS2, r12
754 751
755#ifdef CONFIG_PTE_64BIT 752#ifdef CONFIG_PTE_64BIT
@@ -887,13 +884,17 @@ KernelSPE:
887 lwz r3,_MSR(r1) 884 lwz r3,_MSR(r1)
888 oris r3,r3,MSR_SPE@h 885 oris r3,r3,MSR_SPE@h
889 stw r3,_MSR(r1) /* enable use of SPE after return */ 886 stw r3,_MSR(r1) /* enable use of SPE after return */
887#ifdef CONFIG_PRINTK
890 lis r3,87f@h 888 lis r3,87f@h
891 ori r3,r3,87f@l 889 ori r3,r3,87f@l
892 mr r4,r2 /* current */ 890 mr r4,r2 /* current */
893 lwz r5,_NIP(r1) 891 lwz r5,_NIP(r1)
894 bl printk 892 bl printk
893#endif
895 b ret_from_except 894 b ret_from_except
895#ifdef CONFIG_PRINTK
89687: .string "SPE used in kernel (task=%p, pc=%x) \n" 89687: .string "SPE used in kernel (task=%p, pc=%x) \n"
897#endif
897 .align 4,0 898 .align 4,0
898 899
899#endif /* CONFIG_SPE */ 900#endif /* CONFIG_SPE */
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 5547ae6e6b0b..ec94f906ea43 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -42,12 +42,7 @@
42 42
43#define DBG(...) 43#define DBG(...)
44 44
45#ifdef CONFIG_IOMMU_VMERGE 45static int novmerge;
46static int novmerge = 0;
47#else
48static int novmerge = 1;
49#endif
50
51static int protect4gb = 1; 46static int protect4gb = 1;
52 47
53static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int); 48static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int);
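
With CONFIG_IOMMU_VMERGE removed, virtual merging is always compiled in and novmerge simply defaults to 0 (static storage is zero-initialized). Any remaining control presumably moves to run time; as a purely hypothetical sketch of a boot-parameter toggle for such a flag — the parameter name is invented for illustration, not taken from this patch:

#include <linux/init.h>
#include <linux/string.h>

static int example_novmerge;	/* zero-initialized: merging stays enabled */

static int __init example_iommu_setup(char *str)
{
	if (!strcmp(str, "novmerge"))
		example_novmerge = 1;
	else if (!strcmp(str, "vmerge"))
		example_novmerge = 0;
	return 1;
}
__setup("example_iommu=", example_iommu_setup);
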
diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c
index b152de3e64d4..8f58986c2ad9 100644
--- a/arch/powerpc/kernel/setup_32.c
+++ b/arch/powerpc/kernel/setup_32.c
@@ -39,7 +39,6 @@
39#include <asm/serial.h> 39#include <asm/serial.h>
40#include <asm/udbg.h> 40#include <asm/udbg.h>
41#include <asm/mmu_context.h> 41#include <asm/mmu_context.h>
42#include <asm/swiotlb.h>
43 42
44#include "setup.h" 43#include "setup.h"
45 44
@@ -343,11 +342,6 @@ void __init setup_arch(char **cmdline_p)
343 ppc_md.setup_arch(); 342 ppc_md.setup_arch();
344 if ( ppc_md.progress ) ppc_md.progress("arch: exit", 0x3eab); 343 if ( ppc_md.progress ) ppc_md.progress("arch: exit", 0x3eab);
345 344
346#ifdef CONFIG_SWIOTLB
347 if (ppc_swiotlb_enable)
348 swiotlb_init(1);
349#endif
350
351 paging_init(); 345 paging_init();
352 346
353 /* Initialize the MMU context management stuff */ 347 /* Initialize the MMU context management stuff */
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 63547394048c..914389158a9b 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -61,7 +61,6 @@
61#include <asm/xmon.h> 61#include <asm/xmon.h>
62#include <asm/udbg.h> 62#include <asm/udbg.h>
63#include <asm/kexec.h> 63#include <asm/kexec.h>
64#include <asm/swiotlb.h>
65#include <asm/mmu_context.h> 64#include <asm/mmu_context.h>
66 65
67#include "setup.h" 66#include "setup.h"
@@ -541,11 +540,6 @@ void __init setup_arch(char **cmdline_p)
541 if (ppc_md.setup_arch) 540 if (ppc_md.setup_arch)
542 ppc_md.setup_arch(); 541 ppc_md.setup_arch();
543 542
544#ifdef CONFIG_SWIOTLB
545 if (ppc_swiotlb_enable)
546 swiotlb_init(1);
547#endif
548
549 paging_init(); 543 paging_init();
550 544
551 /* Initialize the MMU context management stuff */ 545 /* Initialize the MMU context management stuff */
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 311224cdb7ad..448f972b22f5 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -48,6 +48,7 @@
48#include <asm/sparsemem.h> 48#include <asm/sparsemem.h>
49#include <asm/vdso.h> 49#include <asm/vdso.h>
50#include <asm/fixmap.h> 50#include <asm/fixmap.h>
51#include <asm/swiotlb.h>
51 52
52#include "mmu_decl.h" 53#include "mmu_decl.h"
53 54
@@ -320,6 +321,11 @@ void __init mem_init(void)
320 struct page *page; 321 struct page *page;
321 unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize; 322 unsigned long reservedpages = 0, codesize, initsize, datasize, bsssize;
322 323
324#ifdef CONFIG_SWIOTLB
325 if (ppc_swiotlb_enable)
326 swiotlb_init(1);
327#endif
328
323 num_physpages = lmb.memory.size >> PAGE_SHIFT; 329 num_physpages = lmb.memory.size >> PAGE_SHIFT;
324 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); 330 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
325 331
diff --git a/arch/s390/boot/compressed/misc.c b/arch/s390/boot/compressed/misc.c
index a97d69525829..14e0479d3888 100644
--- a/arch/s390/boot/compressed/misc.c
+++ b/arch/s390/boot/compressed/misc.c
@@ -24,8 +24,8 @@
24/* Symbols defined by linker scripts */ 24/* Symbols defined by linker scripts */
25extern char input_data[]; 25extern char input_data[];
26extern int input_len; 26extern int input_len;
27extern int _text; 27extern char _text, _end;
28extern int _end; 28extern char _bss, _ebss;
29 29
30static void error(char *m); 30static void error(char *m);
31 31
@@ -129,12 +129,12 @@ unsigned long decompress_kernel(void)
129 unsigned long output_addr; 129 unsigned long output_addr;
130 unsigned char *output; 130 unsigned char *output;
131 131
132 check_ipl_parmblock((void *) 0, (unsigned long) output + SZ__bss_start);
133 memset(&_bss, 0, &_ebss - &_bss);
132 free_mem_ptr = (unsigned long)&_end; 134 free_mem_ptr = (unsigned long)&_end;
133 free_mem_end_ptr = free_mem_ptr + HEAP_SIZE; 135 free_mem_end_ptr = free_mem_ptr + HEAP_SIZE;
134 output = (unsigned char *) ((free_mem_end_ptr + 4095UL) & -4096UL); 136 output = (unsigned char *) ((free_mem_end_ptr + 4095UL) & -4096UL);
135 137
136 check_ipl_parmblock((void *) 0, (unsigned long) output + SZ__bss_start);
137
138#ifdef CONFIG_BLK_DEV_INITRD 138#ifdef CONFIG_BLK_DEV_INITRD
139 /* 139 /*
140 * Move the initrd right behind the end of the decompressed 140 * Move the initrd right behind the end of the decompressed
diff --git a/arch/s390/include/asm/system.h b/arch/s390/include/asm/system.h
index 67ee6c3c6bb3..1741c1556a4e 100644
--- a/arch/s390/include/asm/system.h
+++ b/arch/s390/include/asm/system.h
@@ -110,6 +110,7 @@ extern void pfault_fini(void);
110#endif /* CONFIG_PFAULT */ 110#endif /* CONFIG_PFAULT */
111 111
112extern void cmma_init(void); 112extern void cmma_init(void);
113extern int memcpy_real(void *, void *, size_t);
113 114
114#define finish_arch_switch(prev) do { \ 115#define finish_arch_switch(prev) do { \
115 set_fs(current->thread.mm_segment); \ 116 set_fs(current->thread.mm_segment); \
@@ -218,8 +219,8 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size)
218 " l %0,%2\n" 219 " l %0,%2\n"
219 "0: nr %0,%5\n" 220 "0: nr %0,%5\n"
220 " lr %1,%0\n" 221 " lr %1,%0\n"
221 " or %0,%2\n" 222 " or %0,%3\n"
222 " or %1,%3\n" 223 " or %1,%4\n"
223 " cs %0,%1,%2\n" 224 " cs %0,%1,%2\n"
224 " jnl 1f\n" 225 " jnl 1f\n"
225 " xr %1,%0\n" 226 " xr %1,%0\n"
@@ -239,8 +240,8 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size)
239 " l %0,%2\n" 240 " l %0,%2\n"
240 "0: nr %0,%5\n" 241 "0: nr %0,%5\n"
241 " lr %1,%0\n" 242 " lr %1,%0\n"
242 " or %0,%2\n" 243 " or %0,%3\n"
243 " or %1,%3\n" 244 " or %1,%4\n"
244 " cs %0,%1,%2\n" 245 " cs %0,%1,%2\n"
245 " jnl 1f\n" 246 " jnl 1f\n"
246 " xr %1,%0\n" 247 " xr %1,%0\n"
diff --git a/arch/s390/kernel/head.S b/arch/s390/kernel/head.S
index ca4a62bd862f..9d1f76702d47 100644
--- a/arch/s390/kernel/head.S
+++ b/arch/s390/kernel/head.S
@@ -517,7 +517,10 @@ startup:
517 lhi %r1,2 # mode 2 = esame (dump) 517 lhi %r1,2 # mode 2 = esame (dump)
518 sigp %r1,%r0,0x12 # switch to esame mode 518 sigp %r1,%r0,0x12 # switch to esame mode
519 sam64 # switch to 64 bit mode 519 sam64 # switch to 64 bit mode
520 larl %r13,4f
521 lmh %r0,%r15,0(%r13) # clear high-order half
520 jg startup_continue 522 jg startup_continue
5234: .fill 16,4,0x0
521#else 524#else
522 mvi __LC_AR_MODE_ID,0 # set ESA flag (mode 0) 525 mvi __LC_AR_MODE_ID,0 # set ESA flag (mode 0)
523 l %r13,4f-.LPG0(%r13) 526 l %r13,4f-.LPG0(%r13)
diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S
index 39580e768658..1f70970de0aa 100644
--- a/arch/s390/kernel/head64.S
+++ b/arch/s390/kernel/head64.S
@@ -21,7 +21,6 @@ startup_continue:
21 larl %r1,sched_clock_base_cc 21 larl %r1,sched_clock_base_cc
22 mvc 0(8,%r1),__LC_LAST_UPDATE_CLOCK 22 mvc 0(8,%r1),__LC_LAST_UPDATE_CLOCK
23 larl %r13,.LPG1 # get base 23 larl %r13,.LPG1 # get base
24 lmh %r0,%r15,.Lzero64-.LPG1(%r13) # clear high-order half
25 lctlg %c0,%c15,.Lctl-.LPG1(%r13) # load control registers 24 lctlg %c0,%c15,.Lctl-.LPG1(%r13) # load control registers
26 lg %r12,.Lparmaddr-.LPG1(%r13) # pointer to parameter area 25 lg %r12,.Lparmaddr-.LPG1(%r13) # pointer to parameter area
27 # move IPL device to lowcore 26 # move IPL device to lowcore
@@ -67,7 +66,6 @@ startup_continue:
67.L4malign:.quad 0xffffffffffc00000 66.L4malign:.quad 0xffffffffffc00000
68.Lscan2g:.quad 0x80000000 + 0x20000 - 8 # 2GB + 128K - 8 67.Lscan2g:.quad 0x80000000 + 0x20000 - 8 # 2GB + 128K - 8
69.Lnop: .long 0x07000700 68.Lnop: .long 0x07000700
70.Lzero64:.fill 16,4,0x0
71.Lparmaddr: 69.Lparmaddr:
72 .quad PARMAREA 70 .quad PARMAREA
73 .align 64 71 .align 64
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 77a63ae419f0..ba363d99de43 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -401,7 +401,7 @@ setup_lowcore(void)
401 * Setup lowcore for boot cpu 401 * Setup lowcore for boot cpu
402 */ 402 */
403 BUILD_BUG_ON(sizeof(struct _lowcore) != LC_PAGES * 4096); 403 BUILD_BUG_ON(sizeof(struct _lowcore) != LC_PAGES * 4096);
404 lc = __alloc_bootmem(LC_PAGES * PAGE_SIZE, LC_PAGES * PAGE_SIZE, 0); 404 lc = __alloc_bootmem_low(LC_PAGES * PAGE_SIZE, LC_PAGES * PAGE_SIZE, 0);
405 lc->restart_psw.mask = PSW_BASE_BITS | PSW_DEFAULT_KEY; 405 lc->restart_psw.mask = PSW_BASE_BITS | PSW_DEFAULT_KEY;
406 lc->restart_psw.addr = 406 lc->restart_psw.addr =
407 PSW_ADDR_AMODE | (unsigned long) restart_int_handler; 407 PSW_ADDR_AMODE | (unsigned long) restart_int_handler;
@@ -433,7 +433,7 @@ setup_lowcore(void)
433#ifndef CONFIG_64BIT 433#ifndef CONFIG_64BIT
434 if (MACHINE_HAS_IEEE) { 434 if (MACHINE_HAS_IEEE) {
435 lc->extended_save_area_addr = (__u32) 435 lc->extended_save_area_addr = (__u32)
436 __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, 0); 436 __alloc_bootmem_low(PAGE_SIZE, PAGE_SIZE, 0);
437 /* enable extended save area */ 437 /* enable extended save area */
438 __ctl_set_bit(14, 29); 438 __ctl_set_bit(14, 29);
439 } 439 }
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 29f65bce55e1..d7d24fc3d6b7 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -292,9 +292,9 @@ static void __init smp_get_save_area(unsigned int cpu, unsigned int phy_cpu)
292 zfcpdump_save_areas[cpu] = kmalloc(sizeof(struct save_area), GFP_KERNEL); 292 zfcpdump_save_areas[cpu] = kmalloc(sizeof(struct save_area), GFP_KERNEL);
293 while (raw_sigp(phy_cpu, sigp_stop_and_store_status) == sigp_busy) 293 while (raw_sigp(phy_cpu, sigp_stop_and_store_status) == sigp_busy)
294 cpu_relax(); 294 cpu_relax();
295 memcpy(zfcpdump_save_areas[cpu], 295 memcpy_real(zfcpdump_save_areas[cpu],
296 (void *)(unsigned long) store_prefix() + SAVE_AREA_BASE, 296 (void *)(unsigned long) store_prefix() + SAVE_AREA_BASE,
297 sizeof(struct save_area)); 297 sizeof(struct save_area));
298} 298}
299 299
300struct save_area *zfcpdump_save_areas[NR_CPUS + 1]; 300struct save_area *zfcpdump_save_areas[NR_CPUS + 1];
diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c
index 81756271dc44..a8c2af8c650f 100644
--- a/arch/s390/mm/maccess.c
+++ b/arch/s390/mm/maccess.c
@@ -59,3 +59,29 @@ long probe_kernel_write(void *dst, void *src, size_t size)
59 } 59 }
60 return copied < 0 ? -EFAULT : 0; 60 return copied < 0 ? -EFAULT : 0;
61} 61}
62
63int memcpy_real(void *dest, void *src, size_t count)
64{
65 register unsigned long _dest asm("2") = (unsigned long) dest;
66 register unsigned long _len1 asm("3") = (unsigned long) count;
67 register unsigned long _src asm("4") = (unsigned long) src;
68 register unsigned long _len2 asm("5") = (unsigned long) count;
69 unsigned long flags;
70 int rc = -EFAULT;
71
72 if (!count)
73 return 0;
74 flags = __raw_local_irq_stnsm(0xf8UL);
75 asm volatile (
76 "0: mvcle %1,%2,0x0\n"
77 "1: jo 0b\n"
78 " lhi %0,0x0\n"
79 "2:\n"
80 EX_TABLE(1b,2b)
81 : "+d" (rc), "+d" (_dest), "+d" (_src), "+d" (_len1),
82 "+d" (_len2), "=m" (*((long *) dest))
83 : "m" (*((long *) src))
84 : "cc", "memory");
85 __raw_local_irq_ssm(flags);
86 return rc;
87}
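
memcpy_real() copies with interrupts off and DAT disabled (the stnsm/ssm pair), so it can read absolute storage that is not mapped — which is what the smp.c hunk above now uses to capture the save areas of stopped CPUs. Copies should stay short, since the machine runs blind while it executes. A hedged sketch of a caller:

/* Hypothetical: snapshot a stopped CPU's save area from real storage. */
static int example_read_save_area(void *buf, unsigned long real_addr, size_t count)
{
	return memcpy_real(buf, (void *) real_addr, count);
}
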
diff --git a/arch/sh/boards/mach-ecovec24/setup.c b/arch/sh/boards/mach-ecovec24/setup.c
index 39ed8722d11a..6c13b92742e8 100644
--- a/arch/sh/boards/mach-ecovec24/setup.c
+++ b/arch/sh/boards/mach-ecovec24/setup.c
@@ -836,6 +836,8 @@ static void __init sh_eth_init(struct sh_eth_plat_data *pd)
836 pd->mac_addr[i] = mac_read(a, 0x10 + i); 836 pd->mac_addr[i] = mac_read(a, 0x10 + i);
837 msleep(10); 837 msleep(10);
838 } 838 }
839
840 i2c_put_adapter(a);
839} 841}
840#else 842#else
841static void __init sh_eth_init(struct sh_eth_plat_data *pd) 843static void __init sh_eth_init(struct sh_eth_plat_data *pd)
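
The added i2c_put_adapter() balances the i2c_get_adapter() taken earlier in sh_eth_init(): a successful get pins the adapter (and its owning module), and the reference must be dropped once the MAC-address reads are done. The pattern in isolation, with an illustrative bus number:

#include <linux/errno.h>
#include <linux/i2c.h>

/* Borrow a bus, use it, drop the reference. */
static int example_use_bus(void)
{
	struct i2c_adapter *a = i2c_get_adapter(1);

	if (!a)
		return -ENODEV;
	/* ... i2c_transfer(a, msgs, num) ... */
	i2c_put_adapter(a);
	return 0;
}
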
diff --git a/arch/sh/boards/mach-se/7724/setup.c b/arch/sh/boards/mach-se/7724/setup.c
index 66cdbc3c7af9..ccaa290e9aba 100644
--- a/arch/sh/boards/mach-se/7724/setup.c
+++ b/arch/sh/boards/mach-se/7724/setup.c
@@ -52,6 +52,13 @@
52 * and change SW41 to use 720p 52 * and change SW41 to use 720p
53 */ 53 */
54 54
55/*
56 * about sound
57 *
58 * This setup.c supports FSI slave mode.
 59 * Please change the J20, J21 and J22 pins to the 1-2 connection.
60 */
61
55/* Heartbeat */ 62/* Heartbeat */
56static struct resource heartbeat_resource = { 63static struct resource heartbeat_resource = {
57 .start = PA_LED, 64 .start = PA_LED,
@@ -276,6 +283,7 @@ static struct clk fsimcka_clk = {
276 .rate = 0, /* unknown */ 283 .rate = 0, /* unknown */
277}; 284};
278 285
 286/* change the J20, J21 and J22 pins to the 1-2 connection to use slave mode */
279struct sh_fsi_platform_info fsi_info = { 287struct sh_fsi_platform_info fsi_info = {
280 .porta_flags = SH_FSI_BRS_INV | 288 .porta_flags = SH_FSI_BRS_INV |
281 SH_FSI_OUT_SLAVE_MODE | 289 SH_FSI_OUT_SLAVE_MODE |
diff --git a/arch/sh/configs/ecovec24_defconfig b/arch/sh/configs/ecovec24_defconfig
index 18e3356406f3..6041c66dd10e 100644
--- a/arch/sh/configs/ecovec24_defconfig
+++ b/arch/sh/configs/ecovec24_defconfig
@@ -1,7 +1,7 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.33-rc2 3# Linux kernel version: 2.6.34-rc2
4# Mon Jan 4 11:20:36 2010 4# Mon Mar 29 02:21:58 2010
5# 5#
6CONFIG_SUPERH=y 6CONFIG_SUPERH=y
7CONFIG_SUPERH32=y 7CONFIG_SUPERH32=y
@@ -13,8 +13,8 @@ CONFIG_GENERIC_FIND_NEXT_BIT=y
13CONFIG_GENERIC_HWEIGHT=y 13CONFIG_GENERIC_HWEIGHT=y
14CONFIG_GENERIC_HARDIRQS=y 14CONFIG_GENERIC_HARDIRQS=y
15CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y 15CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
16CONFIG_GENERIC_IRQ_PROBE=y
17CONFIG_IRQ_PER_CPU=y 16CONFIG_IRQ_PER_CPU=y
17CONFIG_SPARSE_IRQ=y
18CONFIG_GENERIC_GPIO=y 18CONFIG_GENERIC_GPIO=y
19CONFIG_GENERIC_TIME=y 19CONFIG_GENERIC_TIME=y
20CONFIG_GENERIC_CLOCKEVENTS=y 20CONFIG_GENERIC_CLOCKEVENTS=y
@@ -32,6 +32,7 @@ CONFIG_ARCH_NO_VIRT_TO_BUS=y
32CONFIG_ARCH_HAS_DEFAULT_IDLE=y 32CONFIG_ARCH_HAS_DEFAULT_IDLE=y
33CONFIG_ARCH_HAS_CPU_IDLE_WAIT=y 33CONFIG_ARCH_HAS_CPU_IDLE_WAIT=y
34CONFIG_DMA_NONCOHERENT=y 34CONFIG_DMA_NONCOHERENT=y
35CONFIG_NEED_DMA_MAP_STATE=y
35CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" 36CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
36CONFIG_CONSTRUCTORS=y 37CONFIG_CONSTRUCTORS=y
37 38
@@ -47,9 +48,11 @@ CONFIG_LOCALVERSION=""
47CONFIG_HAVE_KERNEL_GZIP=y 48CONFIG_HAVE_KERNEL_GZIP=y
48CONFIG_HAVE_KERNEL_BZIP2=y 49CONFIG_HAVE_KERNEL_BZIP2=y
49CONFIG_HAVE_KERNEL_LZMA=y 50CONFIG_HAVE_KERNEL_LZMA=y
51CONFIG_HAVE_KERNEL_LZO=y
50CONFIG_KERNEL_GZIP=y 52CONFIG_KERNEL_GZIP=y
51# CONFIG_KERNEL_BZIP2 is not set 53# CONFIG_KERNEL_BZIP2 is not set
52# CONFIG_KERNEL_LZMA is not set 54# CONFIG_KERNEL_LZMA is not set
55# CONFIG_KERNEL_LZO is not set
53CONFIG_SWAP=y 56CONFIG_SWAP=y
54CONFIG_SYSVIPC=y 57CONFIG_SYSVIPC=y
55CONFIG_SYSVIPC_SYSCTL=y 58CONFIG_SYSVIPC_SYSCTL=y
@@ -71,14 +74,8 @@ CONFIG_RCU_FANOUT=32
71# CONFIG_TREE_RCU_TRACE is not set 74# CONFIG_TREE_RCU_TRACE is not set
72# CONFIG_IKCONFIG is not set 75# CONFIG_IKCONFIG is not set
73CONFIG_LOG_BUF_SHIFT=14 76CONFIG_LOG_BUF_SHIFT=14
74CONFIG_GROUP_SCHED=y
75CONFIG_FAIR_GROUP_SCHED=y
76# CONFIG_RT_GROUP_SCHED is not set
77CONFIG_USER_SCHED=y
78# CONFIG_CGROUP_SCHED is not set
79# CONFIG_CGROUPS is not set 77# CONFIG_CGROUPS is not set
80CONFIG_SYSFS_DEPRECATED=y 78# CONFIG_SYSFS_DEPRECATED_V2 is not set
81CONFIG_SYSFS_DEPRECATED_V2=y
82# CONFIG_RELAY is not set 79# CONFIG_RELAY is not set
83# CONFIG_NAMESPACES is not set 80# CONFIG_NAMESPACES is not set
84# CONFIG_BLK_DEV_INITRD is not set 81# CONFIG_BLK_DEV_INITRD is not set
@@ -107,7 +104,7 @@ CONFIG_PERF_USE_VMALLOC=y
107# 104#
108# Kernel Performance Events And Counters 105# Kernel Performance Events And Counters
109# 106#
110# CONFIG_PERF_EVENTS is not set 107CONFIG_PERF_EVENTS=y
111# CONFIG_PERF_COUNTERS is not set 108# CONFIG_PERF_COUNTERS is not set
112CONFIG_VM_EVENT_COUNTERS=y 109CONFIG_VM_EVENT_COUNTERS=y
113CONFIG_COMPAT_BRK=y 110CONFIG_COMPAT_BRK=y
@@ -116,13 +113,13 @@ CONFIG_SLAB=y
116# CONFIG_SLOB is not set 113# CONFIG_SLOB is not set
117# CONFIG_PROFILING is not set 114# CONFIG_PROFILING is not set
118CONFIG_HAVE_OPROFILE=y 115CONFIG_HAVE_OPROFILE=y
119CONFIG_HAVE_IOREMAP_PROT=y
120CONFIG_HAVE_KPROBES=y 116CONFIG_HAVE_KPROBES=y
121CONFIG_HAVE_KRETPROBES=y 117CONFIG_HAVE_KRETPROBES=y
122CONFIG_HAVE_ARCH_TRACEHOOK=y 118CONFIG_HAVE_ARCH_TRACEHOOK=y
123CONFIG_HAVE_DMA_ATTRS=y 119CONFIG_HAVE_DMA_ATTRS=y
124CONFIG_HAVE_CLK=y 120CONFIG_HAVE_CLK=y
125CONFIG_HAVE_DMA_API_DEBUG=y 121CONFIG_HAVE_DMA_API_DEBUG=y
122CONFIG_HAVE_HW_BREAKPOINT=y
126 123
127# 124#
128# GCOV-based kernel profiling 125# GCOV-based kernel profiling
@@ -234,12 +231,12 @@ CONFIG_CPU_SUBTYPE_SH7724=y
234CONFIG_QUICKLIST=y 231CONFIG_QUICKLIST=y
235CONFIG_MMU=y 232CONFIG_MMU=y
236CONFIG_PAGE_OFFSET=0x80000000 233CONFIG_PAGE_OFFSET=0x80000000
237CONFIG_FORCE_MAX_ZONEORDER=11 234CONFIG_FORCE_MAX_ZONEORDER=12
238CONFIG_MEMORY_START=0x08000000 235CONFIG_MEMORY_START=0x08000000
239CONFIG_MEMORY_SIZE=0x10000000 236CONFIG_MEMORY_SIZE=0x10000000
240CONFIG_29BIT=y 237CONFIG_29BIT=y
241# CONFIG_PMB_ENABLE is not set 238# CONFIG_PMB is not set
242# CONFIG_X2TLB is not set 239CONFIG_X2TLB=y
243CONFIG_VSYSCALL=y 240CONFIG_VSYSCALL=y
244CONFIG_ARCH_FLATMEM_ENABLE=y 241CONFIG_ARCH_FLATMEM_ENABLE=y
245CONFIG_ARCH_SPARSEMEM_ENABLE=y 242CONFIG_ARCH_SPARSEMEM_ENABLE=y
@@ -247,6 +244,8 @@ CONFIG_ARCH_SPARSEMEM_DEFAULT=y
247CONFIG_MAX_ACTIVE_REGIONS=1 244CONFIG_MAX_ACTIVE_REGIONS=1
248CONFIG_ARCH_POPULATES_NODE_MAP=y 245CONFIG_ARCH_POPULATES_NODE_MAP=y
249CONFIG_ARCH_SELECT_MEMORY_MODEL=y 246CONFIG_ARCH_SELECT_MEMORY_MODEL=y
247CONFIG_IOREMAP_FIXED=y
248CONFIG_UNCACHED_MAPPING=y
250CONFIG_PAGE_SIZE_4KB=y 249CONFIG_PAGE_SIZE_4KB=y
251# CONFIG_PAGE_SIZE_8KB is not set 250# CONFIG_PAGE_SIZE_8KB is not set
252# CONFIG_PAGE_SIZE_16KB is not set 251# CONFIG_PAGE_SIZE_16KB is not set
@@ -262,7 +261,7 @@ CONFIG_PAGEFLAGS_EXTENDED=y
262CONFIG_SPLIT_PTLOCK_CPUS=4 261CONFIG_SPLIT_PTLOCK_CPUS=4
263# CONFIG_PHYS_ADDR_T_64BIT is not set 262# CONFIG_PHYS_ADDR_T_64BIT is not set
264CONFIG_ZONE_DMA_FLAG=0 263CONFIG_ZONE_DMA_FLAG=0
265CONFIG_NR_QUICK=2 264CONFIG_NR_QUICK=1
266# CONFIG_KSM is not set 265# CONFIG_KSM is not set
267CONFIG_DEFAULT_MMAP_MIN_ADDR=4096 266CONFIG_DEFAULT_MMAP_MIN_ADDR=4096
268 267
@@ -337,7 +336,6 @@ CONFIG_SECCOMP=y
337# CONFIG_PREEMPT_VOLUNTARY is not set 336# CONFIG_PREEMPT_VOLUNTARY is not set
338CONFIG_PREEMPT=y 337CONFIG_PREEMPT=y
339CONFIG_GUSA=y 338CONFIG_GUSA=y
340# CONFIG_SPARSE_IRQ is not set
341 339
342# 340#
343# Boot options 341# Boot options
@@ -347,7 +345,7 @@ CONFIG_BOOT_LINK_OFFSET=0x00800000
347CONFIG_ENTRY_OFFSET=0x00001000 345CONFIG_ENTRY_OFFSET=0x00001000
348CONFIG_CMDLINE_OVERWRITE=y 346CONFIG_CMDLINE_OVERWRITE=y
349# CONFIG_CMDLINE_EXTEND is not set 347# CONFIG_CMDLINE_EXTEND is not set
350CONFIG_CMDLINE="console=tty0, console=ttySC0,115200 root=/dev/nfs ip=dhcp mem=120M memchunk.vpu=4m" 348CONFIG_CMDLINE="console=tty0, console=ttySC0,115200 root=/dev/nfs ip=dhcp mem=248M memchunk.vpu=8m memchunk.veu0=4m"
351 349
352# 350#
353# Bus options 351# Bus options
@@ -373,6 +371,7 @@ CONFIG_SUSPEND=y
373CONFIG_SUSPEND_FREEZER=y 371CONFIG_SUSPEND_FREEZER=y
374# CONFIG_HIBERNATION is not set 372# CONFIG_HIBERNATION is not set
375CONFIG_PM_RUNTIME=y 373CONFIG_PM_RUNTIME=y
374CONFIG_PM_OPS=y
376# CONFIG_CPU_IDLE is not set 375# CONFIG_CPU_IDLE is not set
377CONFIG_NET=y 376CONFIG_NET=y
378 377
@@ -380,7 +379,6 @@ CONFIG_NET=y
380# Networking options 379# Networking options
381# 380#
382CONFIG_PACKET=y 381CONFIG_PACKET=y
383# CONFIG_PACKET_MMAP is not set
384CONFIG_UNIX=y 382CONFIG_UNIX=y
385# CONFIG_NET_KEY is not set 383# CONFIG_NET_KEY is not set
386CONFIG_INET=y 384CONFIG_INET=y
@@ -445,7 +443,45 @@ CONFIG_DEFAULT_TCP_CONG="cubic"
445# CONFIG_NET_PKTGEN is not set 443# CONFIG_NET_PKTGEN is not set
446# CONFIG_HAMRADIO is not set 444# CONFIG_HAMRADIO is not set
447# CONFIG_CAN is not set 445# CONFIG_CAN is not set
448# CONFIG_IRDA is not set 446CONFIG_IRDA=y
447
448#
449# IrDA protocols
450#
451# CONFIG_IRLAN is not set
452# CONFIG_IRCOMM is not set
453# CONFIG_IRDA_ULTRA is not set
454
455#
456# IrDA options
457#
458# CONFIG_IRDA_CACHE_LAST_LSAP is not set
459# CONFIG_IRDA_FAST_RR is not set
460# CONFIG_IRDA_DEBUG is not set
461
462#
463# Infrared-port device drivers
464#
465
466#
467# SIR device drivers
468#
469# CONFIG_IRTTY_SIR is not set
470
471#
472# Dongle support
473#
474CONFIG_SH_SIR=y
475# CONFIG_KINGSUN_DONGLE is not set
476# CONFIG_KSDAZZLE_DONGLE is not set
477# CONFIG_KS959_DONGLE is not set
478
479#
480# FIR device drivers
481#
482# CONFIG_USB_IRDA is not set
483# CONFIG_SIGMATEL_FIR is not set
484# CONFIG_MCS_FIR is not set
449# CONFIG_BT is not set 485# CONFIG_BT is not set
450# CONFIG_AF_RXRPC is not set 486# CONFIG_AF_RXRPC is not set
451CONFIG_WIRELESS=y 487CONFIG_WIRELESS=y
@@ -556,6 +592,7 @@ CONFIG_MTD_NAND_IDS=y
556# CONFIG_MTD_NAND_NANDSIM is not set 592# CONFIG_MTD_NAND_NANDSIM is not set
557# CONFIG_MTD_NAND_PLATFORM is not set 593# CONFIG_MTD_NAND_PLATFORM is not set
558# CONFIG_MTD_ALAUDA is not set 594# CONFIG_MTD_ALAUDA is not set
595# CONFIG_MTD_NAND_SH_FLCTL is not set
559# CONFIG_MTD_ONENAND is not set 596# CONFIG_MTD_ONENAND is not set
560 597
561# 598#
@@ -597,6 +634,7 @@ CONFIG_MISC_DEVICES=y
597# CONFIG_ICS932S401 is not set 634# CONFIG_ICS932S401 is not set
598# CONFIG_ENCLOSURE_SERVICES is not set 635# CONFIG_ENCLOSURE_SERVICES is not set
599# CONFIG_ISL29003 is not set 636# CONFIG_ISL29003 is not set
637# CONFIG_SENSORS_TSL2550 is not set
600# CONFIG_DS1682 is not set 638# CONFIG_DS1682 is not set
601# CONFIG_TI_DAC7512 is not set 639# CONFIG_TI_DAC7512 is not set
602# CONFIG_C2PORT is not set 640# CONFIG_C2PORT is not set
@@ -616,6 +654,7 @@ CONFIG_HAVE_IDE=y
616# 654#
617# SCSI device support 655# SCSI device support
618# 656#
657CONFIG_SCSI_MOD=y
619# CONFIG_RAID_ATTRS is not set 658# CONFIG_RAID_ATTRS is not set
620CONFIG_SCSI=y 659CONFIG_SCSI=y
621CONFIG_SCSI_DMA=y 660CONFIG_SCSI_DMA=y
@@ -768,7 +807,29 @@ CONFIG_KEYBOARD_SH_KEYSC=y
768# CONFIG_INPUT_MOUSE is not set 807# CONFIG_INPUT_MOUSE is not set
769# CONFIG_INPUT_JOYSTICK is not set 808# CONFIG_INPUT_JOYSTICK is not set
770# CONFIG_INPUT_TABLET is not set 809# CONFIG_INPUT_TABLET is not set
771# CONFIG_INPUT_TOUCHSCREEN is not set 810CONFIG_INPUT_TOUCHSCREEN=y
811# CONFIG_TOUCHSCREEN_ADS7846 is not set
812# CONFIG_TOUCHSCREEN_AD7877 is not set
813# CONFIG_TOUCHSCREEN_AD7879_I2C is not set
814# CONFIG_TOUCHSCREEN_AD7879_SPI is not set
815# CONFIG_TOUCHSCREEN_AD7879 is not set
816# CONFIG_TOUCHSCREEN_DYNAPRO is not set
817# CONFIG_TOUCHSCREEN_EETI is not set
818# CONFIG_TOUCHSCREEN_FUJITSU is not set
819# CONFIG_TOUCHSCREEN_GUNZE is not set
820# CONFIG_TOUCHSCREEN_ELO is not set
821# CONFIG_TOUCHSCREEN_WACOM_W8001 is not set
822# CONFIG_TOUCHSCREEN_MCS5000 is not set
823# CONFIG_TOUCHSCREEN_MTOUCH is not set
824# CONFIG_TOUCHSCREEN_INEXIO is not set
825# CONFIG_TOUCHSCREEN_MK712 is not set
826# CONFIG_TOUCHSCREEN_PENMOUNT is not set
827# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set
828# CONFIG_TOUCHSCREEN_TOUCHWIN is not set
829# CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set
830# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set
831CONFIG_TOUCHSCREEN_TSC2007=y
832# CONFIG_TOUCHSCREEN_W90X900 is not set
772# CONFIG_INPUT_MISC is not set 833# CONFIG_INPUT_MISC is not set
773 834
774# 835#
@@ -802,10 +863,10 @@ CONFIG_SERIAL_SH_SCI_NR_UARTS=6
802CONFIG_SERIAL_SH_SCI_CONSOLE=y 863CONFIG_SERIAL_SH_SCI_CONSOLE=y
803CONFIG_SERIAL_CORE=y 864CONFIG_SERIAL_CORE=y
804CONFIG_SERIAL_CORE_CONSOLE=y 865CONFIG_SERIAL_CORE_CONSOLE=y
866# CONFIG_SERIAL_TIMBERDALE is not set
805CONFIG_UNIX98_PTYS=y 867CONFIG_UNIX98_PTYS=y
806# CONFIG_DEVPTS_MULTIPLE_INSTANCES is not set 868# CONFIG_DEVPTS_MULTIPLE_INSTANCES is not set
807CONFIG_LEGACY_PTYS=y 869# CONFIG_LEGACY_PTYS is not set
808CONFIG_LEGACY_PTY_COUNT=256
809# CONFIG_IPMI_HANDLER is not set 870# CONFIG_IPMI_HANDLER is not set
810CONFIG_HW_RANDOM=y 871CONFIG_HW_RANDOM=y
811# CONFIG_HW_RANDOM_TIMERIOMEM is not set 872# CONFIG_HW_RANDOM_TIMERIOMEM is not set
@@ -830,6 +891,7 @@ CONFIG_I2C_HELPER_AUTO=y
830# CONFIG_I2C_OCORES is not set 891# CONFIG_I2C_OCORES is not set
831CONFIG_I2C_SH_MOBILE=y 892CONFIG_I2C_SH_MOBILE=y
832# CONFIG_I2C_SIMTEC is not set 893# CONFIG_I2C_SIMTEC is not set
894# CONFIG_I2C_XILINX is not set
833 895
834# 896#
835# External I2C/SMBus adapter drivers 897# External I2C/SMBus adapter drivers
@@ -843,15 +905,9 @@ CONFIG_I2C_SH_MOBILE=y
843# 905#
844# CONFIG_I2C_PCA_PLATFORM is not set 906# CONFIG_I2C_PCA_PLATFORM is not set
845# CONFIG_I2C_STUB is not set 907# CONFIG_I2C_STUB is not set
846
847#
848# Miscellaneous I2C Chip support
849#
850# CONFIG_SENSORS_TSL2550 is not set
851# CONFIG_I2C_DEBUG_CORE is not set 908# CONFIG_I2C_DEBUG_CORE is not set
852# CONFIG_I2C_DEBUG_ALGO is not set 909# CONFIG_I2C_DEBUG_ALGO is not set
853# CONFIG_I2C_DEBUG_BUS is not set 910# CONFIG_I2C_DEBUG_BUS is not set
854# CONFIG_I2C_DEBUG_CHIP is not set
855CONFIG_SPI=y 911CONFIG_SPI=y
856CONFIG_SPI_MASTER=y 912CONFIG_SPI_MASTER=y
857 913
@@ -882,13 +938,16 @@ CONFIG_GPIOLIB=y
882# 938#
883# Memory mapped GPIO expanders: 939# Memory mapped GPIO expanders:
884# 940#
941# CONFIG_GPIO_IT8761E is not set
885 942
886# 943#
887# I2C GPIO expanders: 944# I2C GPIO expanders:
888# 945#
946# CONFIG_GPIO_MAX7300 is not set
889# CONFIG_GPIO_MAX732X is not set 947# CONFIG_GPIO_MAX732X is not set
890# CONFIG_GPIO_PCA953X is not set 948# CONFIG_GPIO_PCA953X is not set
891# CONFIG_GPIO_PCF857X is not set 949# CONFIG_GPIO_PCF857X is not set
950# CONFIG_GPIO_ADP5588 is not set
892 951
893# 952#
894# PCI GPIO expanders: 953# PCI GPIO expanders:
@@ -919,23 +978,26 @@ CONFIG_SSB_POSSIBLE=y
919# 978#
920# Multifunction device drivers 979# Multifunction device drivers
921# 980#
922# CONFIG_MFD_CORE is not set 981CONFIG_MFD_CORE=y
982# CONFIG_MFD_88PM860X is not set
923# CONFIG_MFD_SM501 is not set 983# CONFIG_MFD_SM501 is not set
924# CONFIG_MFD_SH_MOBILE_SDHI is not set 984CONFIG_MFD_SH_MOBILE_SDHI=y
925# CONFIG_HTC_PASIC3 is not set 985# CONFIG_HTC_PASIC3 is not set
986# CONFIG_HTC_I2CPLD is not set
926# CONFIG_TPS65010 is not set 987# CONFIG_TPS65010 is not set
927# CONFIG_TWL4030_CORE is not set 988# CONFIG_TWL4030_CORE is not set
928# CONFIG_MFD_TMIO is not set 989# CONFIG_MFD_TMIO is not set
929# CONFIG_PMIC_DA903X is not set 990# CONFIG_PMIC_DA903X is not set
930# CONFIG_PMIC_ADP5520 is not set 991# CONFIG_PMIC_ADP5520 is not set
992# CONFIG_MFD_MAX8925 is not set
931# CONFIG_MFD_WM8400 is not set 993# CONFIG_MFD_WM8400 is not set
932# CONFIG_MFD_WM831X is not set 994# CONFIG_MFD_WM831X is not set
933# CONFIG_MFD_WM8350_I2C is not set 995# CONFIG_MFD_WM8350_I2C is not set
996# CONFIG_MFD_WM8994 is not set
934# CONFIG_MFD_PCF50633 is not set 997# CONFIG_MFD_PCF50633 is not set
935# CONFIG_MFD_MC13783 is not set 998# CONFIG_MFD_MC13783 is not set
936# CONFIG_AB3100_CORE is not set 999# CONFIG_AB3100_CORE is not set
937# CONFIG_EZX_PCAP is not set 1000# CONFIG_EZX_PCAP is not set
938# CONFIG_MFD_88PM8607 is not set
939# CONFIG_AB4500_CORE is not set 1001# CONFIG_AB4500_CORE is not set
940# CONFIG_REGULATOR is not set 1002# CONFIG_REGULATOR is not set
941CONFIG_MEDIA_SUPPORT=y 1003CONFIG_MEDIA_SUPPORT=y
@@ -985,10 +1047,10 @@ CONFIG_SOC_CAMERA=y
985# CONFIG_SOC_CAMERA_MT9M001 is not set 1047# CONFIG_SOC_CAMERA_MT9M001 is not set
986# CONFIG_SOC_CAMERA_MT9M111 is not set 1048# CONFIG_SOC_CAMERA_MT9M111 is not set
987# CONFIG_SOC_CAMERA_MT9T031 is not set 1049# CONFIG_SOC_CAMERA_MT9T031 is not set
988# CONFIG_SOC_CAMERA_MT9T112 is not set 1050CONFIG_SOC_CAMERA_MT9T112=y
989# CONFIG_SOC_CAMERA_MT9V022 is not set 1051# CONFIG_SOC_CAMERA_MT9V022 is not set
990# CONFIG_SOC_CAMERA_RJ54N1 is not set 1052# CONFIG_SOC_CAMERA_RJ54N1 is not set
991# CONFIG_SOC_CAMERA_TW9910 is not set 1053CONFIG_SOC_CAMERA_TW9910=y
992# CONFIG_SOC_CAMERA_PLATFORM is not set 1054# CONFIG_SOC_CAMERA_PLATFORM is not set
993# CONFIG_SOC_CAMERA_OV772X is not set 1055# CONFIG_SOC_CAMERA_OV772X is not set
994# CONFIG_SOC_CAMERA_OV9640 is not set 1056# CONFIG_SOC_CAMERA_OV9640 is not set
@@ -1001,6 +1063,7 @@ CONFIG_RADIO_ADAPTERS=y
1001# CONFIG_RADIO_SI470X is not set 1063# CONFIG_RADIO_SI470X is not set
1002# CONFIG_USB_MR800 is not set 1064# CONFIG_USB_MR800 is not set
1003# CONFIG_RADIO_TEA5764 is not set 1065# CONFIG_RADIO_TEA5764 is not set
1066# CONFIG_RADIO_SAA7706H is not set
1004# CONFIG_RADIO_TEF6862 is not set 1067# CONFIG_RADIO_TEF6862 is not set
1005# CONFIG_DAB is not set 1068# CONFIG_DAB is not set
1006 1069
@@ -1034,6 +1097,7 @@ CONFIG_FB_DEFERRED_IO=y
1034# 1097#
1035# CONFIG_FB_S1D13XXX is not set 1098# CONFIG_FB_S1D13XXX is not set
1036CONFIG_FB_SH_MOBILE_LCDC=y 1099CONFIG_FB_SH_MOBILE_LCDC=y
1100# CONFIG_FB_TMIO is not set
1037# CONFIG_FB_VIRTUAL is not set 1101# CONFIG_FB_VIRTUAL is not set
1038# CONFIG_FB_METRONOME is not set 1102# CONFIG_FB_METRONOME is not set
1039# CONFIG_FB_MB862XX is not set 1103# CONFIG_FB_MB862XX is not set
@@ -1062,7 +1126,46 @@ CONFIG_LOGO=y
1062# CONFIG_LOGO_SUPERH_MONO is not set 1126# CONFIG_LOGO_SUPERH_MONO is not set
1063# CONFIG_LOGO_SUPERH_VGA16 is not set 1127# CONFIG_LOGO_SUPERH_VGA16 is not set
1064CONFIG_LOGO_SUPERH_CLUT224=y 1128CONFIG_LOGO_SUPERH_CLUT224=y
1065# CONFIG_SOUND is not set 1129CONFIG_SOUND=y
1130CONFIG_SOUND_OSS_CORE=y
1131CONFIG_SOUND_OSS_CORE_PRECLAIM=y
1132CONFIG_SND=y
1133CONFIG_SND_TIMER=y
1134CONFIG_SND_PCM=y
1135CONFIG_SND_JACK=y
1136CONFIG_SND_SEQUENCER=y
1137CONFIG_SND_SEQ_DUMMY=y
1138CONFIG_SND_OSSEMUL=y
1139CONFIG_SND_MIXER_OSS=y
1140CONFIG_SND_PCM_OSS=y
1141CONFIG_SND_PCM_OSS_PLUGINS=y
1142# CONFIG_SND_SEQUENCER_OSS is not set
1143# CONFIG_SND_DYNAMIC_MINORS is not set
1144CONFIG_SND_SUPPORT_OLD_API=y
1145CONFIG_SND_VERBOSE_PROCFS=y
1146# CONFIG_SND_VERBOSE_PRINTK is not set
1147# CONFIG_SND_DEBUG is not set
1148# CONFIG_SND_RAWMIDI_SEQ is not set
1149# CONFIG_SND_OPL3_LIB_SEQ is not set
1150# CONFIG_SND_OPL4_LIB_SEQ is not set
1151# CONFIG_SND_SBAWE_SEQ is not set
1152# CONFIG_SND_EMU10K1_SEQ is not set
1153# CONFIG_SND_DRIVERS is not set
1154# CONFIG_SND_SPI is not set
1155CONFIG_SND_SUPERH=y
1156# CONFIG_SND_USB is not set
1157CONFIG_SND_SOC=y
1158
1159#
1160# SoC Audio support for SuperH
1161#
1162CONFIG_SND_SOC_SH4_FSI=y
1163# CONFIG_SND_FSI_AK4642 is not set
1164CONFIG_SND_FSI_DA7210=y
1165CONFIG_SND_SOC_I2C_AND_SPI=y
1166# CONFIG_SND_SOC_ALL_CODECS is not set
1167CONFIG_SND_SOC_DA7210=y
1168# CONFIG_SOUND_PRIME is not set
1066CONFIG_HID_SUPPORT=y 1169CONFIG_HID_SUPPORT=y
1067CONFIG_HID=y 1170CONFIG_HID=y
1068# CONFIG_HIDRAW is not set 1171# CONFIG_HIDRAW is not set
@@ -1077,6 +1180,7 @@ CONFIG_USB_HID=y
1077# 1180#
1078# Special HID drivers 1181# Special HID drivers
1079# 1182#
1183# CONFIG_HID_3M_PCT is not set
1080# CONFIG_HID_A4TECH is not set 1184# CONFIG_HID_A4TECH is not set
1081# CONFIG_HID_APPLE is not set 1185# CONFIG_HID_APPLE is not set
1082# CONFIG_HID_BELKIN is not set 1186# CONFIG_HID_BELKIN is not set
@@ -1091,12 +1195,16 @@ CONFIG_USB_HID=y
1091# CONFIG_HID_KENSINGTON is not set 1195# CONFIG_HID_KENSINGTON is not set
1092# CONFIG_HID_LOGITECH is not set 1196# CONFIG_HID_LOGITECH is not set
1093# CONFIG_HID_MICROSOFT is not set 1197# CONFIG_HID_MICROSOFT is not set
1198# CONFIG_HID_MOSART is not set
1094# CONFIG_HID_MONTEREY is not set 1199# CONFIG_HID_MONTEREY is not set
1095# CONFIG_HID_NTRIG is not set 1200# CONFIG_HID_NTRIG is not set
1201# CONFIG_HID_ORTEK is not set
1096# CONFIG_HID_PANTHERLORD is not set 1202# CONFIG_HID_PANTHERLORD is not set
1097# CONFIG_HID_PETALYNX is not set 1203# CONFIG_HID_PETALYNX is not set
1204# CONFIG_HID_QUANTA is not set
1098# CONFIG_HID_SAMSUNG is not set 1205# CONFIG_HID_SAMSUNG is not set
1099# CONFIG_HID_SONY is not set 1206# CONFIG_HID_SONY is not set
1207# CONFIG_HID_STANTUM is not set
1100# CONFIG_HID_SUNPLUS is not set 1208# CONFIG_HID_SUNPLUS is not set
1101# CONFIG_HID_GREENASIA is not set 1209# CONFIG_HID_GREENASIA is not set
1102# CONFIG_HID_SMARTJOYPLUS is not set 1210# CONFIG_HID_SMARTJOYPLUS is not set
@@ -1136,6 +1244,7 @@ CONFIG_USB_MON=y
1136# CONFIG_USB_SL811_HCD is not set 1244# CONFIG_USB_SL811_HCD is not set
1137CONFIG_USB_R8A66597_HCD=y 1245CONFIG_USB_R8A66597_HCD=y
1138# CONFIG_USB_HWA_HCD is not set 1246# CONFIG_USB_HWA_HCD is not set
1247# CONFIG_USB_GADGET_MUSB_HDRC is not set
1139 1248
1140# 1249#
1141# USB Device Class drivers 1250# USB Device Class drivers
@@ -1188,7 +1297,6 @@ CONFIG_USB_STORAGE=y
1188# CONFIG_USB_RIO500 is not set 1297# CONFIG_USB_RIO500 is not set
1189# CONFIG_USB_LEGOTOWER is not set 1298# CONFIG_USB_LEGOTOWER is not set
1190# CONFIG_USB_LCD is not set 1299# CONFIG_USB_LCD is not set
1191# CONFIG_USB_BERRY_CHARGE is not set
1192# CONFIG_USB_LED is not set 1300# CONFIG_USB_LED is not set
1193# CONFIG_USB_CYPRESS_CY7C63 is not set 1301# CONFIG_USB_CYPRESS_CY7C63 is not set
1194# CONFIG_USB_CYTHERM is not set 1302# CONFIG_USB_CYTHERM is not set
@@ -1200,8 +1308,45 @@ CONFIG_USB_STORAGE=y
1200# CONFIG_USB_IOWARRIOR is not set 1308# CONFIG_USB_IOWARRIOR is not set
1201# CONFIG_USB_TEST is not set 1309# CONFIG_USB_TEST is not set
1202# CONFIG_USB_ISIGHTFW is not set 1310# CONFIG_USB_ISIGHTFW is not set
1203# CONFIG_USB_VST is not set 1311CONFIG_USB_GADGET=y
1204# CONFIG_USB_GADGET is not set 1312# CONFIG_USB_GADGET_DEBUG_FILES is not set
1313# CONFIG_USB_GADGET_DEBUG_FS is not set
1314CONFIG_USB_GADGET_VBUS_DRAW=2
1315CONFIG_USB_GADGET_SELECTED=y
1316# CONFIG_USB_GADGET_AT91 is not set
1317# CONFIG_USB_GADGET_ATMEL_USBA is not set
1318# CONFIG_USB_GADGET_FSL_USB2 is not set
1319# CONFIG_USB_GADGET_LH7A40X is not set
1320# CONFIG_USB_GADGET_OMAP is not set
1321# CONFIG_USB_GADGET_PXA25X is not set
1322CONFIG_USB_GADGET_R8A66597=y
1323CONFIG_USB_R8A66597=y
1324# CONFIG_USB_GADGET_PXA27X is not set
1325# CONFIG_USB_GADGET_S3C_HSOTG is not set
1326# CONFIG_USB_GADGET_IMX is not set
1327# CONFIG_USB_GADGET_S3C2410 is not set
1328# CONFIG_USB_GADGET_M66592 is not set
1329# CONFIG_USB_GADGET_AMD5536UDC is not set
1330# CONFIG_USB_GADGET_FSL_QE is not set
1331# CONFIG_USB_GADGET_CI13XXX is not set
1332# CONFIG_USB_GADGET_NET2280 is not set
1333# CONFIG_USB_GADGET_GOKU is not set
1334# CONFIG_USB_GADGET_LANGWELL is not set
1335# CONFIG_USB_GADGET_DUMMY_HCD is not set
1336CONFIG_USB_GADGET_DUALSPEED=y
1337# CONFIG_USB_ZERO is not set
1338# CONFIG_USB_AUDIO is not set
1339# CONFIG_USB_ETH is not set
1340# CONFIG_USB_GADGETFS is not set
1341CONFIG_USB_FILE_STORAGE=m
1342# CONFIG_USB_FILE_STORAGE_TEST is not set
1343# CONFIG_USB_MASS_STORAGE is not set
1344# CONFIG_USB_G_SERIAL is not set
1345# CONFIG_USB_MIDI_GADGET is not set
1346# CONFIG_USB_G_PRINTER is not set
1347# CONFIG_USB_CDC_COMPOSITE is not set
1348# CONFIG_USB_G_NOKIA is not set
1349# CONFIG_USB_G_MULTI is not set
1205 1350
1206# 1351#
1207# OTG and related infrastructure 1352# OTG and related infrastructure
@@ -1224,10 +1369,8 @@ CONFIG_MMC_BLOCK_BOUNCE=y
1224# MMC/SD/SDIO Host Controller Drivers 1369# MMC/SD/SDIO Host Controller Drivers
1225# 1370#
1226# CONFIG_MMC_SDHCI is not set 1371# CONFIG_MMC_SDHCI is not set
1227# CONFIG_MMC_AT91 is not set
1228# CONFIG_MMC_ATMELMCI is not set
1229CONFIG_MMC_SPI=y 1372CONFIG_MMC_SPI=y
1230# CONFIG_MMC_TMIO is not set 1373CONFIG_MMC_TMIO=y
1231# CONFIG_MEMSTICK is not set 1374# CONFIG_MEMSTICK is not set
1232# CONFIG_NEW_LEDS is not set 1375# CONFIG_NEW_LEDS is not set
1233# CONFIG_ACCESSIBILITY is not set 1376# CONFIG_ACCESSIBILITY is not set
@@ -1253,10 +1396,10 @@ CONFIG_RTC_INTF_DEV=y
1253# CONFIG_RTC_DRV_DS1374 is not set 1396# CONFIG_RTC_DRV_DS1374 is not set
1254# CONFIG_RTC_DRV_DS1672 is not set 1397# CONFIG_RTC_DRV_DS1672 is not set
1255# CONFIG_RTC_DRV_MAX6900 is not set 1398# CONFIG_RTC_DRV_MAX6900 is not set
1256# CONFIG_RTC_DRV_RS5C372 is not set 1399CONFIG_RTC_DRV_RS5C372=y
1257# CONFIG_RTC_DRV_ISL1208 is not set 1400# CONFIG_RTC_DRV_ISL1208 is not set
1258# CONFIG_RTC_DRV_X1205 is not set 1401# CONFIG_RTC_DRV_X1205 is not set
1259CONFIG_RTC_DRV_PCF8563=y 1402# CONFIG_RTC_DRV_PCF8563 is not set
1260# CONFIG_RTC_DRV_PCF8583 is not set 1403# CONFIG_RTC_DRV_PCF8583 is not set
1261# CONFIG_RTC_DRV_M41T80 is not set 1404# CONFIG_RTC_DRV_M41T80 is not set
1262# CONFIG_RTC_DRV_BQ32K is not set 1405# CONFIG_RTC_DRV_BQ32K is not set
@@ -1303,8 +1446,6 @@ CONFIG_RTC_DRV_PCF8563=y
1303CONFIG_UIO=y 1446CONFIG_UIO=y
1304# CONFIG_UIO_PDRV is not set 1447# CONFIG_UIO_PDRV is not set
1305CONFIG_UIO_PDRV_GENIRQ=y 1448CONFIG_UIO_PDRV_GENIRQ=y
1306# CONFIG_UIO_SMX is not set
1307# CONFIG_UIO_SERCOS3 is not set
1308 1449
1309# 1450#
1310# TI VLYNQ 1451# TI VLYNQ
@@ -1390,6 +1531,7 @@ CONFIG_MISC_FILESYSTEMS=y
1390# CONFIG_EFS_FS is not set 1531# CONFIG_EFS_FS is not set
1391# CONFIG_JFFS2_FS is not set 1532# CONFIG_JFFS2_FS is not set
1392# CONFIG_UBIFS_FS is not set 1533# CONFIG_UBIFS_FS is not set
1534# CONFIG_LOGFS is not set
1393# CONFIG_CRAMFS is not set 1535# CONFIG_CRAMFS is not set
1394# CONFIG_SQUASHFS is not set 1536# CONFIG_SQUASHFS is not set
1395# CONFIG_VXFS_FS is not set 1537# CONFIG_VXFS_FS is not set
@@ -1418,6 +1560,7 @@ CONFIG_SUNRPC=y
1418# CONFIG_RPCSEC_GSS_KRB5 is not set 1560# CONFIG_RPCSEC_GSS_KRB5 is not set
1419# CONFIG_RPCSEC_GSS_SPKM3 is not set 1561# CONFIG_RPCSEC_GSS_SPKM3 is not set
1420# CONFIG_SMB_FS is not set 1562# CONFIG_SMB_FS is not set
1563# CONFIG_CEPH_FS is not set
1421# CONFIG_CIFS is not set 1564# CONFIG_CIFS is not set
1422# CONFIG_NCP_FS is not set 1565# CONFIG_NCP_FS is not set
1423# CONFIG_CODA_FS is not set 1566# CONFIG_CODA_FS is not set
@@ -1487,6 +1630,7 @@ CONFIG_DEBUG_FS=y
1487CONFIG_DEBUG_BUGVERBOSE=y 1630CONFIG_DEBUG_BUGVERBOSE=y
1488# CONFIG_DEBUG_MEMORY_INIT is not set 1631# CONFIG_DEBUG_MEMORY_INIT is not set
1489# CONFIG_RCU_CPU_STALL_DETECTOR is not set 1632# CONFIG_RCU_CPU_STALL_DETECTOR is not set
1633# CONFIG_LKDTM is not set
1490# CONFIG_LATENCYTOP is not set 1634# CONFIG_LATENCYTOP is not set
1491CONFIG_SYSCTL_SYSCALL_CHECK=y 1635CONFIG_SYSCTL_SYSCALL_CHECK=y
1492CONFIG_HAVE_FUNCTION_TRACER=y 1636CONFIG_HAVE_FUNCTION_TRACER=y
@@ -1618,7 +1762,7 @@ CONFIG_CRYPTO_HW=y
1618# 1762#
1619CONFIG_BITREVERSE=y 1763CONFIG_BITREVERSE=y
1620CONFIG_GENERIC_FIND_LAST_BIT=y 1764CONFIG_GENERIC_FIND_LAST_BIT=y
1621# CONFIG_CRC_CCITT is not set 1765CONFIG_CRC_CCITT=y
1622# CONFIG_CRC16 is not set 1766# CONFIG_CRC16 is not set
1623CONFIG_CRC_T10DIF=y 1767CONFIG_CRC_T10DIF=y
1624CONFIG_CRC_ITU_T=y 1768CONFIG_CRC_ITU_T=y
diff --git a/arch/sh/include/asm/clkdev.h b/arch/sh/include/asm/clkdev.h
new file mode 100644
index 000000000000..5645f358128b
--- /dev/null
+++ b/arch/sh/include/asm/clkdev.h
@@ -0,0 +1,35 @@
1/*
2 * arch/sh/include/asm/clkdev.h
3 *
4 * Cloned from arch/arm/include/asm/clkdev.h:
5 *
6 * Copyright (C) 2008 Russell King.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 *
12 * Helper for the clk API to assist looking up a struct clk.
13 */
14#ifndef __ASM_CLKDEV_H
15#define __ASM_CLKDEV_H
16
17struct clk;
18
19struct clk_lookup {
20 struct list_head node;
21 const char *dev_id;
22 const char *con_id;
23 struct clk *clk;
24};
25
26struct clk_lookup *clkdev_alloc(struct clk *clk, const char *con_id,
27 const char *dev_fmt, ...);
28
29void clkdev_add(struct clk_lookup *cl);
30void clkdev_drop(struct clk_lookup *cl);
31
32void clkdev_add_table(struct clk_lookup *, size_t);
33int clk_add_alias(const char *, const char *, char *, struct device *);
34
35#endif
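
The new header clones ARM's clkdev so SH can register string lookups mapping a (dev_id, con_id) pair to a struct clk. A minimal usage sketch with invented clock and device names:

#include <asm/clkdev.h>

static struct clk example_clk;	/* normally a real, registered clock */

/* Hypothetical: let device "sh-sci.0" find example_clk under "sci_fck". */
static void __init example_add_lookup(void)
{
	struct clk_lookup *cl;

	cl = clkdev_alloc(&example_clk, "sci_fck", "sh-sci.%d", 0);
	if (cl)
		clkdev_add(cl);
}
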
diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index 11da4c5beb68..4b19179230fe 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -45,13 +45,6 @@ struct clk {
45 struct cpufreq_frequency_table *freq_table; 45 struct cpufreq_frequency_table *freq_table;
46}; 46};
47 47
48struct clk_lookup {
49 struct list_head node;
50 const char *dev_id;
51 const char *con_id;
52 struct clk *clk;
53};
54
55#define CLK_ENABLE_ON_INIT (1 << 0) 48#define CLK_ENABLE_ON_INIT (1 << 0)
56 49
57/* Should be defined by processor-specific code */ 50/* Should be defined by processor-specific code */
diff --git a/arch/sh/include/asm/dmaengine.h b/arch/sh/include/asm/dmaengine.h
index bf2f30cf0a27..2a02b611a9ad 100644
--- a/arch/sh/include/asm/dmaengine.h
+++ b/arch/sh/include/asm/dmaengine.h
@@ -10,14 +10,9 @@
10#ifndef ASM_DMAENGINE_H 10#ifndef ASM_DMAENGINE_H
11#define ASM_DMAENGINE_H 11#define ASM_DMAENGINE_H
12 12
13#include <linux/dmaengine.h> 13#include <linux/sh_dma.h>
14#include <linux/list.h>
15 14
16#include <asm/dma-register.h> 15enum {
17
18#define SH_DMAC_MAX_CHANNELS 6
19
20enum sh_dmae_slave_chan_id {
21 SHDMA_SLAVE_SCIF0_TX, 16 SHDMA_SLAVE_SCIF0_TX,
22 SHDMA_SLAVE_SCIF0_RX, 17 SHDMA_SLAVE_SCIF0_RX,
23 SHDMA_SLAVE_SCIF1_TX, 18 SHDMA_SLAVE_SCIF1_TX,
@@ -34,60 +29,6 @@ enum sh_dmae_slave_chan_id {
34 SHDMA_SLAVE_SIUA_RX, 29 SHDMA_SLAVE_SIUA_RX,
35 SHDMA_SLAVE_SIUB_TX, 30 SHDMA_SLAVE_SIUB_TX,
36 SHDMA_SLAVE_SIUB_RX, 31 SHDMA_SLAVE_SIUB_RX,
37 SHDMA_SLAVE_NUMBER, /* Must stay last */
38};
39
40struct sh_dmae_slave_config {
41 enum sh_dmae_slave_chan_id slave_id;
42 dma_addr_t addr;
43 u32 chcr;
44 char mid_rid;
45};
46
47struct sh_dmae_channel {
48 unsigned int offset;
49 unsigned int dmars;
50 unsigned int dmars_bit;
51};
52
53struct sh_dmae_pdata {
54 struct sh_dmae_slave_config *slave;
55 int slave_num;
56 struct sh_dmae_channel *channel;
57 int channel_num;
58 unsigned int ts_low_shift;
59 unsigned int ts_low_mask;
60 unsigned int ts_high_shift;
61 unsigned int ts_high_mask;
62 unsigned int *ts_shift;
63 int ts_shift_num;
64 u16 dmaor_init;
65};
66
67struct device;
68
69/* Used by slave DMA clients to request DMA to/from a specific peripheral */
70struct sh_dmae_slave {
71 enum sh_dmae_slave_chan_id slave_id; /* Set by the platform */
72 struct device *dma_dev; /* Set by the platform */
73 struct sh_dmae_slave_config *config; /* Set by the driver */
74};
75
76struct sh_dmae_regs {
77 u32 sar; /* SAR / source address */
78 u32 dar; /* DAR / destination address */
79 u32 tcr; /* TCR / transfer count */
80};
81
82struct sh_desc {
83 struct sh_dmae_regs hw;
84 struct list_head node;
85 struct dma_async_tx_descriptor async_tx;
86 enum dma_data_direction direction;
87 dma_cookie_t cookie;
88 size_t partial;
89 int chunks;
90 int mark;
91}; 32};
92 33
93#endif 34#endif
diff --git a/arch/sh/include/asm/elf.h b/arch/sh/include/asm/elf.h
index ac04255022b6..ce830faeebbf 100644
--- a/arch/sh/include/asm/elf.h
+++ b/arch/sh/include/asm/elf.h
@@ -211,7 +211,9 @@ extern void __kernel_vsyscall;
211 211
212#define VSYSCALL_AUX_ENT \ 212#define VSYSCALL_AUX_ENT \
213 if (vdso_enabled) \ 213 if (vdso_enabled) \
214 NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_BASE); 214 NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_BASE); \
215 else \
216 NEW_AUX_ENT(AT_IGNORE, 0);
215#else 217#else
216#define VSYSCALL_AUX_ENT 218#define VSYSCALL_AUX_ENT
217#endif /* CONFIG_VSYSCALL */ 219#endif /* CONFIG_VSYSCALL */
@@ -219,7 +221,7 @@ extern void __kernel_vsyscall;
219#ifdef CONFIG_SH_FPU 221#ifdef CONFIG_SH_FPU
220#define FPU_AUX_ENT NEW_AUX_ENT(AT_FPUCW, FPSCR_INIT) 222#define FPU_AUX_ENT NEW_AUX_ENT(AT_FPUCW, FPSCR_INIT)
221#else 223#else
222#define FPU_AUX_ENT 224#define FPU_AUX_ENT NEW_AUX_ENT(AT_IGNORE, 0)
223#endif 225#endif
224 226
225extern int l1i_cache_shape, l1d_cache_shape, l2_cache_shape; 227extern int l1i_cache_shape, l1d_cache_shape, l2_cache_shape;
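
Both hunks pad the auxiliary vector with AT_IGNORE instead of emitting nothing, presumably so the auxv keeps a fixed entry count regardless of vdso_enabled or CONFIG_SH_FPU. The optional entries can still be probed from userspace; a sketch, assuming a libc that provides getauxval():

#include <stdio.h>
#include <sys/auxv.h>

int main(void)
{
	/* getauxval() returns 0 when the entry is absent, which is
	 * exactly what an AT_IGNORE pad slot looks like here */
	unsigned long vdso = getauxval(AT_SYSINFO_EHDR);

	if (vdso)
		printf("vDSO ELF header mapped at %#lx\n", vdso);
	else
		printf("no vDSO advertised\n");
	return 0;
}
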
diff --git a/arch/sh/include/asm/mmu.h b/arch/sh/include/asm/mmu.h
index 19fe84550b49..56e4418c19b9 100644
--- a/arch/sh/include/asm/mmu.h
+++ b/arch/sh/include/asm/mmu.h
@@ -66,6 +66,13 @@ int pmb_unmap(void __iomem *addr);
66 66
67#else 67#else
68 68
69static inline int
70pmb_bolt_mapping(unsigned long virt, phys_addr_t phys,
71 unsigned long size, pgprot_t prot)
72{
73 return -EINVAL;
74}
75
69static inline void __iomem * 76static inline void __iomem *
70pmb_remap_caller(phys_addr_t phys, unsigned long size, 77pmb_remap_caller(phys_addr_t phys, unsigned long size,
71 pgprot_t prot, void *caller) 78 pgprot_t prot, void *caller)
diff --git a/arch/sh/include/asm/siu.h b/arch/sh/include/asm/siu.h
index f1b1e6944a5f..e8d4142baf59 100644
--- a/arch/sh/include/asm/siu.h
+++ b/arch/sh/include/asm/siu.h
@@ -17,10 +17,10 @@ struct device;
17 17
18struct siu_platform { 18struct siu_platform {
19 struct device *dma_dev; 19 struct device *dma_dev;
20 enum sh_dmae_slave_chan_id dma_slave_tx_a; 20 unsigned int dma_slave_tx_a;
21 enum sh_dmae_slave_chan_id dma_slave_rx_a; 21 unsigned int dma_slave_rx_a;
22 enum sh_dmae_slave_chan_id dma_slave_tx_b; 22 unsigned int dma_slave_tx_b;
23 enum sh_dmae_slave_chan_id dma_slave_rx_b; 23 unsigned int dma_slave_rx_b;
24}; 24};
25 25
26#endif /* ASM_SIU_H */ 26#endif /* ASM_SIU_H */
diff --git a/arch/sh/include/cpu-sh4/cpu/mmu_context.h b/arch/sh/include/cpu-sh4/cpu/mmu_context.h
index 03ea75c5315d..2941be617a5f 100644
--- a/arch/sh/include/cpu-sh4/cpu/mmu_context.h
+++ b/arch/sh/include/cpu-sh4/cpu/mmu_context.h
@@ -19,8 +19,17 @@
19 19
20#define MMUCR 0xFF000010 /* MMU Control Register */ 20#define MMUCR 0xFF000010 /* MMU Control Register */
21 21
22#define MMU_TLB_ENTRY_SHIFT 8
23
24#define MMU_ITLB_ADDRESS_ARRAY 0xF2000000
25#define MMU_ITLB_ADDRESS_ARRAY2 0xF2800000
26#define MMU_ITLB_DATA_ARRAY 0xF3000000
27#define MMU_ITLB_DATA_ARRAY2 0xF3800000
28
22#define MMU_UTLB_ADDRESS_ARRAY 0xF6000000 29#define MMU_UTLB_ADDRESS_ARRAY 0xF6000000
23#define MMU_UTLB_ADDRESS_ARRAY2 0xF6800000 30#define MMU_UTLB_ADDRESS_ARRAY2 0xF6800000
31#define MMU_UTLB_DATA_ARRAY 0xF7000000
32#define MMU_UTLB_DATA_ARRAY2 0xF7800000
24#define MMU_PAGE_ASSOC_BIT 0x80 33#define MMU_PAGE_ASSOC_BIT 0x80
25 34
26#define MMUCR_TI (1<<2) 35#define MMUCR_TI (1<<2)
@@ -28,6 +37,8 @@
28#define MMUCR_URB 0x00FC0000 37#define MMUCR_URB 0x00FC0000
29#define MMUCR_URB_SHIFT 18 38#define MMUCR_URB_SHIFT 18
30#define MMUCR_URB_NENTRIES 64 39#define MMUCR_URB_NENTRIES 64
40#define MMUCR_URC 0x0000FC00
41#define MMUCR_URC_SHIFT 10
31 42
32#if defined(CONFIG_32BIT) && defined(CONFIG_CPU_SUBTYPE_ST40) 43#if defined(CONFIG_32BIT) && defined(CONFIG_CPU_SUBTYPE_ST40)
33#define MMUCR_SE (1 << 4) 44#define MMUCR_SE (1 << 4)
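
The new array bases follow the usual SH-4 layout, where entry n of a TLB array sits at base + (n << MMU_TLB_ENTRY_SHIFT), and the URC field of MMUCR selects which UTLB entry the next ldtlb instruction replaces. A sketch of how the constants compose (assumed usage, not code from this patch):

/* Address of UTLB address-array slot n */
static inline unsigned long utlb_addr_array(unsigned int n)
{
	return MMU_UTLB_ADDRESS_ARRAY | (n << MMU_TLB_ENTRY_SHIFT);
}

/* Rewrite the URC field so a following ldtlb targets entry n */
static inline unsigned long mmucr_set_urc(unsigned long mmucr, unsigned int n)
{
	return (mmucr & ~MMUCR_URC) | (n << MMUCR_URC_SHIFT);
}
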
diff --git a/arch/sh/include/cpu-sh4/cpu/watchdog.h b/arch/sh/include/cpu-sh4/cpu/watchdog.h
index 7672301d0c70..7f62b9380938 100644
--- a/arch/sh/include/cpu-sh4/cpu/watchdog.h
+++ b/arch/sh/include/cpu-sh4/cpu/watchdog.h
@@ -21,6 +21,12 @@
21#define WTCNT 0xffcc0000 /*WDTST*/ 21#define WTCNT 0xffcc0000 /*WDTST*/
22#define WTST WTCNT 22#define WTST WTCNT
23#define WTBST 0xffcc0008 /*WDTBST*/ 23#define WTBST 0xffcc0008 /*WDTBST*/
24/* Register definitions */
25#elif defined(CONFIG_CPU_SUBTYPE_SH7722) || \
26 defined(CONFIG_CPU_SUBTYPE_SH7723) || \
27 defined(CONFIG_CPU_SUBTYPE_SH7724)
28#define WTCNT 0xa4520000
29#define WTCSR 0xa4520004
24#else 30#else
25/* Register definitions */ 31/* Register definitions */
26#define WTCNT 0xffc00008 32#define WTCNT 0xffc00008
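
For reference, SH WDT counter and control registers generally demand 16-bit writes carrying a magic upper byte (0x5a for WTCNT, 0xa5 for WTCSR); a plain byte write is discarded. A sketch of the access pattern these new addresses imply, mirroring the helpers in asm/watchdog.h:

static inline void sh_wdt_cnt_write(u8 val)
{
	/* upper byte 0x5a unlocks the counter register */
	__raw_writew(0x5a00 | val, WTCNT);
}

static inline void sh_wdt_csr_write(u8 val)
{
	/* upper byte 0xa5 unlocks the control/status register */
	__raw_writew(0xa500 | val, WTCSR);
}
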
diff --git a/arch/sh/kernel/Makefile b/arch/sh/kernel/Makefile
index 02fd3ae8b0ee..650b92f00ee5 100644
--- a/arch/sh/kernel/Makefile
+++ b/arch/sh/kernel/Makefile
@@ -11,7 +11,7 @@ endif
11 11
12CFLAGS_REMOVE_return_address.o = -pg 12CFLAGS_REMOVE_return_address.o = -pg
13 13
14obj-y := debugtraps.o dma-nommu.o dumpstack.o \ 14obj-y := clkdev.o debugtraps.o dma-nommu.o dumpstack.o \
15 idle.o io.o io_generic.o irq.o \ 15 idle.o io.o io_generic.o irq.o \
16 irq_$(BITS).o machvec.o nmi_debug.o process.o \ 16 irq_$(BITS).o machvec.o nmi_debug.o process.o \
17 process_$(BITS).o ptrace_$(BITS).o \ 17 process_$(BITS).o ptrace_$(BITS).o \
diff --git a/arch/sh/kernel/clkdev.c b/arch/sh/kernel/clkdev.c
new file mode 100644
index 000000000000..defdd6e30908
--- /dev/null
+++ b/arch/sh/kernel/clkdev.c
@@ -0,0 +1,169 @@
1/*
2 * arch/sh/kernel/clkdev.c
3 *
4 * Cloned from arch/arm/common/clkdev.c:
5 *
6 * Copyright (C) 2008 Russell King.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 *
12 * Helper for the clk API to assist looking up a struct clk.
13 */
14#include <linux/module.h>
15#include <linux/kernel.h>
16#include <linux/device.h>
17#include <linux/list.h>
18#include <linux/errno.h>
19#include <linux/err.h>
20#include <linux/string.h>
21#include <linux/mutex.h>
22#include <linux/clk.h>
23#include <linux/slab.h>
24#include <linux/bootmem.h>
25#include <linux/mm.h>
26#include <asm/clock.h>
27#include <asm/clkdev.h>
28
29static LIST_HEAD(clocks);
30static DEFINE_MUTEX(clocks_mutex);
31
32/*
33 * Find the correct struct clk for the device and connection ID.
34 * We do slightly fuzzy matching here:
35 * An entry with a NULL ID is assumed to be a wildcard.
36 * If an entry has a device ID, it must match
37 * If an entry has a connection ID, it must match
38 * Then we take the most specific entry - with the following
39 * order of precedence: dev+con > dev only > con only.
40 */
41static struct clk *clk_find(const char *dev_id, const char *con_id)
42{
43 struct clk_lookup *p;
44 struct clk *clk = NULL;
45 int match, best = 0;
46
47 list_for_each_entry(p, &clocks, node) {
48 match = 0;
49 if (p->dev_id) {
50 if (!dev_id || strcmp(p->dev_id, dev_id))
51 continue;
52 match += 2;
53 }
54 if (p->con_id) {
55 if (!con_id || strcmp(p->con_id, con_id))
56 continue;
57 match += 1;
58 }
59 if (match == 0)
60 continue;
61
62 if (match > best) {
63 clk = p->clk;
64 best = match;
65 }
66 }
67 return clk;
68}
69
70struct clk *clk_get_sys(const char *dev_id, const char *con_id)
71{
72 struct clk *clk;
73
74 mutex_lock(&clocks_mutex);
75 clk = clk_find(dev_id, con_id);
76 mutex_unlock(&clocks_mutex);
77
78 return clk ? clk : ERR_PTR(-ENOENT);
79}
80EXPORT_SYMBOL(clk_get_sys);
81
82void clkdev_add(struct clk_lookup *cl)
83{
84 mutex_lock(&clocks_mutex);
85 list_add_tail(&cl->node, &clocks);
86 mutex_unlock(&clocks_mutex);
87}
88EXPORT_SYMBOL(clkdev_add);
89
90void __init clkdev_add_table(struct clk_lookup *cl, size_t num)
91{
92 mutex_lock(&clocks_mutex);
93 while (num--) {
94 list_add_tail(&cl->node, &clocks);
95 cl++;
96 }
97 mutex_unlock(&clocks_mutex);
98}
99
100#define MAX_DEV_ID 20
101#define MAX_CON_ID 16
102
103struct clk_lookup_alloc {
104 struct clk_lookup cl;
105 char dev_id[MAX_DEV_ID];
106 char con_id[MAX_CON_ID];
107};
108
109struct clk_lookup * __init_refok
110clkdev_alloc(struct clk *clk, const char *con_id, const char *dev_fmt, ...)
111{
112 struct clk_lookup_alloc *cla;
113
114 if (!slab_is_available())
115 cla = alloc_bootmem_low_pages(sizeof(*cla));
116 else
117 cla = kzalloc(sizeof(*cla), GFP_KERNEL);
118
119 if (!cla)
120 return NULL;
121
122 cla->cl.clk = clk;
123 if (con_id) {
124 strlcpy(cla->con_id, con_id, sizeof(cla->con_id));
125 cla->cl.con_id = cla->con_id;
126 }
127
128 if (dev_fmt) {
129 va_list ap;
130
131 va_start(ap, dev_fmt);
132 vscnprintf(cla->dev_id, sizeof(cla->dev_id), dev_fmt, ap);
133 cla->cl.dev_id = cla->dev_id;
134 va_end(ap);
135 }
136
137 return &cla->cl;
138}
139EXPORT_SYMBOL(clkdev_alloc);
140
141int clk_add_alias(const char *alias, const char *alias_dev_name, char *id,
142 struct device *dev)
143{
144 struct clk *r = clk_get(dev, id);
145 struct clk_lookup *l;
146
147 if (IS_ERR(r))
148 return PTR_ERR(r);
149
150 l = clkdev_alloc(r, alias, alias_dev_name);
151 clk_put(r);
152 if (!l)
153 return -ENODEV;
154 clkdev_add(l);
155 return 0;
156}
157EXPORT_SYMBOL(clk_add_alias);
158
159/*
160 * clkdev_drop - remove a dynamically allocated clock lookup
161 */
162void clkdev_drop(struct clk_lookup *cl)
163{
164 mutex_lock(&clocks_mutex);
165 list_del(&cl->node);
166 mutex_unlock(&clocks_mutex);
167 kfree(cl);
168}
169EXPORT_SYMBOL(clkdev_drop);
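
A quick usage sketch for the dynamic half of the API above (the clock and device names are made up for illustration). The bootmem fallback in clkdev_alloc() is what lets early clock setup run before the slab allocator is up; note that clkdev_drop() unconditionally kfree()s, so it is only safe for lookups allocated after slab is available:

static struct clk_lookup *sci_lookup;

static int __init bind_sci_clock(struct clk *clk)
{
	/* dev_fmt is printf-style: this binds to platform device sh-sci.0 */
	sci_lookup = clkdev_alloc(clk, "sci_fck", "sh-sci.%d", 0);
	if (!sci_lookup)
		return -ENOMEM;
	clkdev_add(sci_lookup);
	return 0;
}

static void unbind_sci_clock(void)
{
	clkdev_drop(sci_lookup);	/* unlinks and frees the entry */
}
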
diff --git a/arch/sh/kernel/cpu/clock-cpg.c b/arch/sh/kernel/cpu/clock-cpg.c
index eed5eaff96ba..17a73ad7a20d 100644
--- a/arch/sh/kernel/cpu/clock-cpg.c
+++ b/arch/sh/kernel/cpu/clock-cpg.c
@@ -338,6 +338,11 @@ int __init __deprecated cpg_clk_init(void)
338 ret |= clk_register(clk); 338 ret |= clk_register(clk);
339 } 339 }
340 340
341 clk_add_alias("tmu_fck", NULL, "peripheral_clk", NULL);
342 clk_add_alias("mtu2_fck", NULL, "peripheral_clk", NULL);
343 clk_add_alias("cmt_fck", NULL, "peripheral_clk", NULL);
344 clk_add_alias("sci_ick", NULL, "peripheral_clk", NULL);
345
341 return ret; 346 return ret;
342} 347}
343 348
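
Each clk_add_alias() call above fabricates a con_id-only lookup pointing at peripheral_clk, so on CPG parts that have no per-block gate clocks a driver's clk_get(dev, "tmu_fck") still resolves. Roughly (a sketch, not code from the patch):

static void check_aliases(struct device *dev)
{
	struct clk *p = clk_get(NULL, "peripheral_clk");
	struct clk *t = clk_get(dev, "tmu_fck");	/* matches the alias */

	/* both handles should refer to the same underlying clock */
	if (!IS_ERR(p) && !IS_ERR(t))
		WARN_ON(p != t);
	if (!IS_ERR(t))
		clk_put(t);
	if (!IS_ERR(p))
		clk_put(p);
}
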
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index e9fa1bfed53e..9ded1bc29260 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -10,10 +10,6 @@
10 * 10 *
11 * Modified for omap shared clock framework by Tony Lindgren <tony@atomide.com> 11 * Modified for omap shared clock framework by Tony Lindgren <tony@atomide.com>
12 * 12 *
13 * With clkdev bits:
14 *
15 * Copyright (C) 2008 Russell King.
16 *
17 * This file is subject to the terms and conditions of the GNU General Public 13 * This file is subject to the terms and conditions of the GNU General Public
18 * License. See the file "COPYING" in the main directory of this archive 14 * License. See the file "COPYING" in the main directory of this archive
19 * for more details. 15 * for more details.
@@ -30,6 +26,7 @@
30#include <linux/platform_device.h> 26#include <linux/platform_device.h>
31#include <linux/debugfs.h> 27#include <linux/debugfs.h>
32#include <linux/cpufreq.h> 28#include <linux/cpufreq.h>
29#include <linux/clk.h>
33#include <asm/clock.h> 30#include <asm/clock.h>
34#include <asm/machvec.h> 31#include <asm/machvec.h>
35 32
@@ -398,56 +395,6 @@ long clk_round_rate(struct clk *clk, unsigned long rate)
398EXPORT_SYMBOL_GPL(clk_round_rate); 395EXPORT_SYMBOL_GPL(clk_round_rate);
399 396
400/* 397/*
401 * Find the correct struct clk for the device and connection ID.
402 * We do slightly fuzzy matching here:
403 * An entry with a NULL ID is assumed to be a wildcard.
404 * If an entry has a device ID, it must match
405 * If an entry has a connection ID, it must match
406 * Then we take the most specific entry - with the following
407 * order of precedence: dev+con > dev only > con only.
408 */
409static struct clk *clk_find(const char *dev_id, const char *con_id)
410{
411 struct clk_lookup *p;
412 struct clk *clk = NULL;
413 int match, best = 0;
414
415 list_for_each_entry(p, &clock_list, node) {
416 match = 0;
417 if (p->dev_id) {
418 if (!dev_id || strcmp(p->dev_id, dev_id))
419 continue;
420 match += 2;
421 }
422 if (p->con_id) {
423 if (!con_id || strcmp(p->con_id, con_id))
424 continue;
425 match += 1;
426 }
427 if (match == 0)
428 continue;
429
430 if (match > best) {
431 clk = p->clk;
432 best = match;
433 }
434 }
435 return clk;
436}
437
438struct clk *clk_get_sys(const char *dev_id, const char *con_id)
439{
440 struct clk *clk;
441
442 mutex_lock(&clock_list_sem);
443 clk = clk_find(dev_id, con_id);
444 mutex_unlock(&clock_list_sem);
445
446 return clk ? clk : ERR_PTR(-ENOENT);
447}
448EXPORT_SYMBOL_GPL(clk_get_sys);
449
450/*
451 * Returns a clock. Note that we first try to use device id on the bus 398 * Returns a clock. Note that we first try to use device id on the bus
452 * and clock name. If this fails, we try to use clock name only. 399 * and clock name. If this fails, we try to use clock name only.
453 */ 400 */
diff --git a/arch/sh/kernel/cpu/sh2/setup-sh7619.c b/arch/sh/kernel/cpu/sh2/setup-sh7619.c
index 114c7cee7184..c3638516bffc 100644
--- a/arch/sh/kernel/cpu/sh2/setup-sh7619.c
+++ b/arch/sh/kernel/cpu/sh2/setup-sh7619.c
@@ -128,17 +128,14 @@ static struct platform_device eth_device = {
128}; 128};
129 129
130static struct sh_timer_config cmt0_platform_data = { 130static struct sh_timer_config cmt0_platform_data = {
131 .name = "CMT0",
132 .channel_offset = 0x02, 131 .channel_offset = 0x02,
133 .timer_bit = 0, 132 .timer_bit = 0,
134 .clk = "peripheral_clk",
135 .clockevent_rating = 125, 133 .clockevent_rating = 125,
136 .clocksource_rating = 0, /* disabled due to code generation issues */ 134 .clocksource_rating = 0, /* disabled due to code generation issues */
137}; 135};
138 136
139static struct resource cmt0_resources[] = { 137static struct resource cmt0_resources[] = {
140 [0] = { 138 [0] = {
141 .name = "CMT0",
142 .start = 0xf84a0072, 139 .start = 0xf84a0072,
143 .end = 0xf84a0077, 140 .end = 0xf84a0077,
144 .flags = IORESOURCE_MEM, 141 .flags = IORESOURCE_MEM,
@@ -160,17 +157,14 @@ static struct platform_device cmt0_device = {
160}; 157};
161 158
162static struct sh_timer_config cmt1_platform_data = { 159static struct sh_timer_config cmt1_platform_data = {
163 .name = "CMT1",
164 .channel_offset = 0x08, 160 .channel_offset = 0x08,
165 .timer_bit = 1, 161 .timer_bit = 1,
166 .clk = "peripheral_clk",
167 .clockevent_rating = 125, 162 .clockevent_rating = 125,
168 .clocksource_rating = 0, /* disabled due to code generation issues */ 163 .clocksource_rating = 0, /* disabled due to code generation issues */
169}; 164};
170 165
171static struct resource cmt1_resources[] = { 166static struct resource cmt1_resources[] = {
172 [0] = { 167 [0] = {
173 .name = "CMT1",
174 .start = 0xf84a0078, 168 .start = 0xf84a0078,
175 .end = 0xf84a007d, 169 .end = 0xf84a007d,
176 .flags = IORESOURCE_MEM, 170 .flags = IORESOURCE_MEM,
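
This hunk (and the analogous ones below) drops the per-channel .name and .clk strings: the timer drivers are expected to derive everything from the platform device itself, resolving their clock by a fixed connection ID rather than a board-supplied string. A sketch of the assumed driver side, where the "cmt_fck" con_id lands on the alias registered in cpg_clk_init() above when no dev-specific entry exists:

static struct clk *sh_timer_clk(struct platform_device *pdev)
{
	/* dev_id "sh_cmt.0" + con_id "cmt_fck"; falls back to the
	 * con_id-only alias when there is no dev-specific lookup */
	return clk_get(&pdev->dev, "cmt_fck");
}
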
diff --git a/arch/sh/kernel/cpu/sh2a/setup-mxg.c b/arch/sh/kernel/cpu/sh2a/setup-mxg.c
index 8f669dc9b0da..6c96ea02bf8d 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-mxg.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-mxg.c
@@ -115,16 +115,13 @@ static DECLARE_INTC_DESC(intc_desc, "mxg", vectors, groups,
115 mask_registers, prio_registers, NULL); 115 mask_registers, prio_registers, NULL);
116 116
117static struct sh_timer_config mtu2_0_platform_data = { 117static struct sh_timer_config mtu2_0_platform_data = {
118 .name = "MTU2_0",
119 .channel_offset = -0x80, 118 .channel_offset = -0x80,
120 .timer_bit = 0, 119 .timer_bit = 0,
121 .clk = "peripheral_clk",
122 .clockevent_rating = 200, 120 .clockevent_rating = 200,
123}; 121};
124 122
125static struct resource mtu2_0_resources[] = { 123static struct resource mtu2_0_resources[] = {
126 [0] = { 124 [0] = {
127 .name = "MTU2_0",
128 .start = 0xff801300, 125 .start = 0xff801300,
129 .end = 0xff801326, 126 .end = 0xff801326,
130 .flags = IORESOURCE_MEM, 127 .flags = IORESOURCE_MEM,
@@ -146,16 +143,13 @@ static struct platform_device mtu2_0_device = {
146}; 143};
147 144
148static struct sh_timer_config mtu2_1_platform_data = { 145static struct sh_timer_config mtu2_1_platform_data = {
149 .name = "MTU2_1",
150 .channel_offset = -0x100, 146 .channel_offset = -0x100,
151 .timer_bit = 1, 147 .timer_bit = 1,
152 .clk = "peripheral_clk",
153 .clockevent_rating = 200, 148 .clockevent_rating = 200,
154}; 149};
155 150
156static struct resource mtu2_1_resources[] = { 151static struct resource mtu2_1_resources[] = {
157 [0] = { 152 [0] = {
158 .name = "MTU2_1",
159 .start = 0xff801380, 153 .start = 0xff801380,
160 .end = 0xff801390, 154 .end = 0xff801390,
161 .flags = IORESOURCE_MEM, 155 .flags = IORESOURCE_MEM,
@@ -177,16 +171,13 @@ static struct platform_device mtu2_1_device = {
177}; 171};
178 172
179static struct sh_timer_config mtu2_2_platform_data = { 173static struct sh_timer_config mtu2_2_platform_data = {
180 .name = "MTU2_2",
181 .channel_offset = 0x80, 174 .channel_offset = 0x80,
182 .timer_bit = 2, 175 .timer_bit = 2,
183 .clk = "peripheral_clk",
184 .clockevent_rating = 200, 176 .clockevent_rating = 200,
185}; 177};
186 178
187static struct resource mtu2_2_resources[] = { 179static struct resource mtu2_2_resources[] = {
188 [0] = { 180 [0] = {
189 .name = "MTU2_2",
190 .start = 0xff801000, 181 .start = 0xff801000,
191 .end = 0xff80100a, 182 .end = 0xff80100a,
192 .flags = IORESOURCE_MEM, 183 .flags = IORESOURCE_MEM,
diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c
index 4ccfeb59eb1a..d08bf4c07d60 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c
@@ -318,16 +318,13 @@ static struct platform_device rtc_device = {
318}; 318};
319 319
320static struct sh_timer_config mtu2_0_platform_data = { 320static struct sh_timer_config mtu2_0_platform_data = {
321 .name = "MTU2_0",
322 .channel_offset = -0x80, 321 .channel_offset = -0x80,
323 .timer_bit = 0, 322 .timer_bit = 0,
324 .clk = "peripheral_clk",
325 .clockevent_rating = 200, 323 .clockevent_rating = 200,
326}; 324};
327 325
328static struct resource mtu2_0_resources[] = { 326static struct resource mtu2_0_resources[] = {
329 [0] = { 327 [0] = {
330 .name = "MTU2_0",
331 .start = 0xfffe4300, 328 .start = 0xfffe4300,
332 .end = 0xfffe4326, 329 .end = 0xfffe4326,
333 .flags = IORESOURCE_MEM, 330 .flags = IORESOURCE_MEM,
@@ -349,16 +346,13 @@ static struct platform_device mtu2_0_device = {
349}; 346};
350 347
351static struct sh_timer_config mtu2_1_platform_data = { 348static struct sh_timer_config mtu2_1_platform_data = {
352 .name = "MTU2_1",
353 .channel_offset = -0x100, 349 .channel_offset = -0x100,
354 .timer_bit = 1, 350 .timer_bit = 1,
355 .clk = "peripheral_clk",
356 .clockevent_rating = 200, 351 .clockevent_rating = 200,
357}; 352};
358 353
359static struct resource mtu2_1_resources[] = { 354static struct resource mtu2_1_resources[] = {
360 [0] = { 355 [0] = {
361 .name = "MTU2_1",
362 .start = 0xfffe4380, 356 .start = 0xfffe4380,
363 .end = 0xfffe4390, 357 .end = 0xfffe4390,
364 .flags = IORESOURCE_MEM, 358 .flags = IORESOURCE_MEM,
@@ -380,16 +374,13 @@ static struct platform_device mtu2_1_device = {
380}; 374};
381 375
382static struct sh_timer_config mtu2_2_platform_data = { 376static struct sh_timer_config mtu2_2_platform_data = {
383 .name = "MTU2_2",
384 .channel_offset = 0x80, 377 .channel_offset = 0x80,
385 .timer_bit = 2, 378 .timer_bit = 2,
386 .clk = "peripheral_clk",
387 .clockevent_rating = 200, 379 .clockevent_rating = 200,
388}; 380};
389 381
390static struct resource mtu2_2_resources[] = { 382static struct resource mtu2_2_resources[] = {
391 [0] = { 383 [0] = {
392 .name = "MTU2_2",
393 .start = 0xfffe4000, 384 .start = 0xfffe4000,
394 .end = 0xfffe400a, 385 .end = 0xfffe400a,
395 .flags = IORESOURCE_MEM, 386 .flags = IORESOURCE_MEM,
diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
index 3136966cc9b3..832f401b5860 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
@@ -234,17 +234,14 @@ static struct platform_device scif3_device = {
234}; 234};
235 235
236static struct sh_timer_config cmt0_platform_data = { 236static struct sh_timer_config cmt0_platform_data = {
237 .name = "CMT0",
238 .channel_offset = 0x02, 237 .channel_offset = 0x02,
239 .timer_bit = 0, 238 .timer_bit = 0,
240 .clk = "peripheral_clk",
241 .clockevent_rating = 125, 239 .clockevent_rating = 125,
242 .clocksource_rating = 0, /* disabled due to code generation issues */ 240 .clocksource_rating = 0, /* disabled due to code generation issues */
243}; 241};
244 242
245static struct resource cmt0_resources[] = { 243static struct resource cmt0_resources[] = {
246 [0] = { 244 [0] = {
247 .name = "CMT0",
248 .start = 0xfffec002, 245 .start = 0xfffec002,
249 .end = 0xfffec007, 246 .end = 0xfffec007,
250 .flags = IORESOURCE_MEM, 247 .flags = IORESOURCE_MEM,
@@ -266,17 +263,14 @@ static struct platform_device cmt0_device = {
266}; 263};
267 264
268static struct sh_timer_config cmt1_platform_data = { 265static struct sh_timer_config cmt1_platform_data = {
269 .name = "CMT1",
270 .channel_offset = 0x08, 266 .channel_offset = 0x08,
271 .timer_bit = 1, 267 .timer_bit = 1,
272 .clk = "peripheral_clk",
273 .clockevent_rating = 125, 268 .clockevent_rating = 125,
274 .clocksource_rating = 0, /* disabled due to code generation issues */ 269 .clocksource_rating = 0, /* disabled due to code generation issues */
275}; 270};
276 271
277static struct resource cmt1_resources[] = { 272static struct resource cmt1_resources[] = {
278 [0] = { 273 [0] = {
279 .name = "CMT1",
280 .start = 0xfffec008, 274 .start = 0xfffec008,
281 .end = 0xfffec00d, 275 .end = 0xfffec00d,
282 .flags = IORESOURCE_MEM, 276 .flags = IORESOURCE_MEM,
@@ -298,16 +292,13 @@ static struct platform_device cmt1_device = {
298}; 292};
299 293
300static struct sh_timer_config mtu2_0_platform_data = { 294static struct sh_timer_config mtu2_0_platform_data = {
301 .name = "MTU2_0",
302 .channel_offset = -0x80, 295 .channel_offset = -0x80,
303 .timer_bit = 0, 296 .timer_bit = 0,
304 .clk = "peripheral_clk",
305 .clockevent_rating = 200, 297 .clockevent_rating = 200,
306}; 298};
307 299
308static struct resource mtu2_0_resources[] = { 300static struct resource mtu2_0_resources[] = {
309 [0] = { 301 [0] = {
310 .name = "MTU2_0",
311 .start = 0xfffe4300, 302 .start = 0xfffe4300,
312 .end = 0xfffe4326, 303 .end = 0xfffe4326,
313 .flags = IORESOURCE_MEM, 304 .flags = IORESOURCE_MEM,
@@ -329,16 +320,13 @@ static struct platform_device mtu2_0_device = {
329}; 320};
330 321
331static struct sh_timer_config mtu2_1_platform_data = { 322static struct sh_timer_config mtu2_1_platform_data = {
332 .name = "MTU2_1",
333 .channel_offset = -0x100, 323 .channel_offset = -0x100,
334 .timer_bit = 1, 324 .timer_bit = 1,
335 .clk = "peripheral_clk",
336 .clockevent_rating = 200, 325 .clockevent_rating = 200,
337}; 326};
338 327
339static struct resource mtu2_1_resources[] = { 328static struct resource mtu2_1_resources[] = {
340 [0] = { 329 [0] = {
341 .name = "MTU2_1",
342 .start = 0xfffe4380, 330 .start = 0xfffe4380,
343 .end = 0xfffe4390, 331 .end = 0xfffe4390,
344 .flags = IORESOURCE_MEM, 332 .flags = IORESOURCE_MEM,
diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
index 064873585a8b..dc47b04e1049 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
@@ -194,17 +194,14 @@ static struct platform_device scif3_device = {
194}; 194};
195 195
196static struct sh_timer_config cmt0_platform_data = { 196static struct sh_timer_config cmt0_platform_data = {
197 .name = "CMT0",
198 .channel_offset = 0x02, 197 .channel_offset = 0x02,
199 .timer_bit = 0, 198 .timer_bit = 0,
200 .clk = "peripheral_clk",
201 .clockevent_rating = 125, 199 .clockevent_rating = 125,
202 .clocksource_rating = 0, /* disabled due to code generation issues */ 200 .clocksource_rating = 0, /* disabled due to code generation issues */
203}; 201};
204 202
205static struct resource cmt0_resources[] = { 203static struct resource cmt0_resources[] = {
206 [0] = { 204 [0] = {
207 .name = "CMT0",
208 .start = 0xfffec002, 205 .start = 0xfffec002,
209 .end = 0xfffec007, 206 .end = 0xfffec007,
210 .flags = IORESOURCE_MEM, 207 .flags = IORESOURCE_MEM,
@@ -226,17 +223,14 @@ static struct platform_device cmt0_device = {
226}; 223};
227 224
228static struct sh_timer_config cmt1_platform_data = { 225static struct sh_timer_config cmt1_platform_data = {
229 .name = "CMT1",
230 .channel_offset = 0x08, 226 .channel_offset = 0x08,
231 .timer_bit = 1, 227 .timer_bit = 1,
232 .clk = "peripheral_clk",
233 .clockevent_rating = 125, 228 .clockevent_rating = 125,
234 .clocksource_rating = 0, /* disabled due to code generation issues */ 229 .clocksource_rating = 0, /* disabled due to code generation issues */
235}; 230};
236 231
237static struct resource cmt1_resources[] = { 232static struct resource cmt1_resources[] = {
238 [0] = { 233 [0] = {
239 .name = "CMT1",
240 .start = 0xfffec008, 234 .start = 0xfffec008,
241 .end = 0xfffec00d, 235 .end = 0xfffec00d,
242 .flags = IORESOURCE_MEM, 236 .flags = IORESOURCE_MEM,
@@ -258,16 +252,13 @@ static struct platform_device cmt1_device = {
258}; 252};
259 253
260static struct sh_timer_config mtu2_0_platform_data = { 254static struct sh_timer_config mtu2_0_platform_data = {
261 .name = "MTU2_0",
262 .channel_offset = -0x80, 255 .channel_offset = -0x80,
263 .timer_bit = 0, 256 .timer_bit = 0,
264 .clk = "peripheral_clk",
265 .clockevent_rating = 200, 257 .clockevent_rating = 200,
266}; 258};
267 259
268static struct resource mtu2_0_resources[] = { 260static struct resource mtu2_0_resources[] = {
269 [0] = { 261 [0] = {
270 .name = "MTU2_0",
271 .start = 0xfffe4300, 262 .start = 0xfffe4300,
272 .end = 0xfffe4326, 263 .end = 0xfffe4326,
273 .flags = IORESOURCE_MEM, 264 .flags = IORESOURCE_MEM,
@@ -289,16 +280,13 @@ static struct platform_device mtu2_0_device = {
289}; 280};
290 281
291static struct sh_timer_config mtu2_1_platform_data = { 282static struct sh_timer_config mtu2_1_platform_data = {
292 .name = "MTU2_1",
293 .channel_offset = -0x100, 283 .channel_offset = -0x100,
294 .timer_bit = 1, 284 .timer_bit = 1,
295 .clk = "peripheral_clk",
296 .clockevent_rating = 200, 285 .clockevent_rating = 200,
297}; 286};
298 287
299static struct resource mtu2_1_resources[] = { 288static struct resource mtu2_1_resources[] = {
300 [0] = { 289 [0] = {
301 .name = "MTU2_1",
302 .start = 0xfffe4380, 290 .start = 0xfffe4380,
303 .end = 0xfffe4390, 291 .end = 0xfffe4390,
304 .flags = IORESOURCE_MEM, 292 .flags = IORESOURCE_MEM,
@@ -320,16 +308,13 @@ static struct platform_device mtu2_1_device = {
320}; 308};
321 309
322static struct sh_timer_config mtu2_2_platform_data = { 310static struct sh_timer_config mtu2_2_platform_data = {
323 .name = "MTU2_2",
324 .channel_offset = 0x80, 311 .channel_offset = 0x80,
325 .timer_bit = 2, 312 .timer_bit = 2,
326 .clk = "peripheral_clk",
327 .clockevent_rating = 200, 313 .clockevent_rating = 200,
328}; 314};
329 315
330static struct resource mtu2_2_resources[] = { 316static struct resource mtu2_2_resources[] = {
331 [0] = { 317 [0] = {
332 .name = "MTU2_2",
333 .start = 0xfffe4000, 318 .start = 0xfffe4000,
334 .end = 0xfffe400a, 319 .end = 0xfffe400a,
335 .flags = IORESOURCE_MEM, 320 .flags = IORESOURCE_MEM,
diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7705.c b/arch/sh/kernel/cpu/sh3/setup-sh7705.c
index 7b892d60e3a0..baadd7f54d94 100644
--- a/arch/sh/kernel/cpu/sh3/setup-sh7705.c
+++ b/arch/sh/kernel/cpu/sh3/setup-sh7705.c
@@ -124,16 +124,13 @@ static struct platform_device rtc_device = {
124}; 124};
125 125
126static struct sh_timer_config tmu0_platform_data = { 126static struct sh_timer_config tmu0_platform_data = {
127 .name = "TMU0",
128 .channel_offset = 0x02, 127 .channel_offset = 0x02,
129 .timer_bit = 0, 128 .timer_bit = 0,
130 .clk = "peripheral_clk",
131 .clockevent_rating = 200, 129 .clockevent_rating = 200,
132}; 130};
133 131
134static struct resource tmu0_resources[] = { 132static struct resource tmu0_resources[] = {
135 [0] = { 133 [0] = {
136 .name = "TMU0",
137 .start = 0xfffffe94, 134 .start = 0xfffffe94,
138 .end = 0xfffffe9f, 135 .end = 0xfffffe9f,
139 .flags = IORESOURCE_MEM, 136 .flags = IORESOURCE_MEM,
@@ -155,16 +152,13 @@ static struct platform_device tmu0_device = {
155}; 152};
156 153
157static struct sh_timer_config tmu1_platform_data = { 154static struct sh_timer_config tmu1_platform_data = {
158 .name = "TMU1",
159 .channel_offset = 0xe, 155 .channel_offset = 0xe,
160 .timer_bit = 1, 156 .timer_bit = 1,
161 .clk = "peripheral_clk",
162 .clocksource_rating = 200, 157 .clocksource_rating = 200,
163}; 158};
164 159
165static struct resource tmu1_resources[] = { 160static struct resource tmu1_resources[] = {
166 [0] = { 161 [0] = {
167 .name = "TMU1",
168 .start = 0xfffffea0, 162 .start = 0xfffffea0,
169 .end = 0xfffffeab, 163 .end = 0xfffffeab,
170 .flags = IORESOURCE_MEM, 164 .flags = IORESOURCE_MEM,
@@ -186,15 +180,12 @@ static struct platform_device tmu1_device = {
186}; 180};
187 181
188static struct sh_timer_config tmu2_platform_data = { 182static struct sh_timer_config tmu2_platform_data = {
189 .name = "TMU2",
190 .channel_offset = 0x1a, 183 .channel_offset = 0x1a,
191 .timer_bit = 2, 184 .timer_bit = 2,
192 .clk = "peripheral_clk",
193}; 185};
194 186
195static struct resource tmu2_resources[] = { 187static struct resource tmu2_resources[] = {
196 [0] = { 188 [0] = {
197 .name = "TMU2",
198 .start = 0xfffffeac, 189 .start = 0xfffffeac,
199 .end = 0xfffffebb, 190 .end = 0xfffffebb,
200 .flags = IORESOURCE_MEM, 191 .flags = IORESOURCE_MEM,
diff --git a/arch/sh/kernel/cpu/sh3/setup-sh770x.c b/arch/sh/kernel/cpu/sh3/setup-sh770x.c
index bc0c4f68c7c7..3cf8c8ef7b32 100644
--- a/arch/sh/kernel/cpu/sh3/setup-sh770x.c
+++ b/arch/sh/kernel/cpu/sh3/setup-sh770x.c
@@ -157,16 +157,13 @@ static struct platform_device scif2_device = {
157#endif 157#endif
158 158
159static struct sh_timer_config tmu0_platform_data = { 159static struct sh_timer_config tmu0_platform_data = {
160 .name = "TMU0",
161 .channel_offset = 0x02, 160 .channel_offset = 0x02,
162 .timer_bit = 0, 161 .timer_bit = 0,
163 .clk = "peripheral_clk",
164 .clockevent_rating = 200, 162 .clockevent_rating = 200,
165}; 163};
166 164
167static struct resource tmu0_resources[] = { 165static struct resource tmu0_resources[] = {
168 [0] = { 166 [0] = {
169 .name = "TMU0",
170 .start = 0xfffffe94, 167 .start = 0xfffffe94,
171 .end = 0xfffffe9f, 168 .end = 0xfffffe9f,
172 .flags = IORESOURCE_MEM, 169 .flags = IORESOURCE_MEM,
@@ -188,16 +185,13 @@ static struct platform_device tmu0_device = {
188}; 185};
189 186
190static struct sh_timer_config tmu1_platform_data = { 187static struct sh_timer_config tmu1_platform_data = {
191 .name = "TMU1",
192 .channel_offset = 0xe, 188 .channel_offset = 0xe,
193 .timer_bit = 1, 189 .timer_bit = 1,
194 .clk = "peripheral_clk",
195 .clocksource_rating = 200, 190 .clocksource_rating = 200,
196}; 191};
197 192
198static struct resource tmu1_resources[] = { 193static struct resource tmu1_resources[] = {
199 [0] = { 194 [0] = {
200 .name = "TMU1",
201 .start = 0xfffffea0, 195 .start = 0xfffffea0,
202 .end = 0xfffffeab, 196 .end = 0xfffffeab,
203 .flags = IORESOURCE_MEM, 197 .flags = IORESOURCE_MEM,
@@ -219,15 +213,12 @@ static struct platform_device tmu1_device = {
219}; 213};
220 214
221static struct sh_timer_config tmu2_platform_data = { 215static struct sh_timer_config tmu2_platform_data = {
222 .name = "TMU2",
223 .channel_offset = 0x1a, 216 .channel_offset = 0x1a,
224 .timer_bit = 2, 217 .timer_bit = 2,
225 .clk = "peripheral_clk",
226}; 218};
227 219
228static struct resource tmu2_resources[] = { 220static struct resource tmu2_resources[] = {
229 [0] = { 221 [0] = {
230 .name = "TMU2",
231 .start = 0xfffffeac, 222 .start = 0xfffffeac,
232 .end = 0xfffffebb, 223 .end = 0xfffffebb,
233 .flags = IORESOURCE_MEM, 224 .flags = IORESOURCE_MEM,
diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7710.c b/arch/sh/kernel/cpu/sh3/setup-sh7710.c
index 0845a3ad006d..b0c2fb4ab479 100644
--- a/arch/sh/kernel/cpu/sh3/setup-sh7710.c
+++ b/arch/sh/kernel/cpu/sh3/setup-sh7710.c
@@ -127,16 +127,13 @@ static struct platform_device scif1_device = {
127}; 127};
128 128
129static struct sh_timer_config tmu0_platform_data = { 129static struct sh_timer_config tmu0_platform_data = {
130 .name = "TMU0",
131 .channel_offset = 0x02, 130 .channel_offset = 0x02,
132 .timer_bit = 0, 131 .timer_bit = 0,
133 .clk = "peripheral_clk",
134 .clockevent_rating = 200, 132 .clockevent_rating = 200,
135}; 133};
136 134
137static struct resource tmu0_resources[] = { 135static struct resource tmu0_resources[] = {
138 [0] = { 136 [0] = {
139 .name = "TMU0",
140 .start = 0xa412fe94, 137 .start = 0xa412fe94,
141 .end = 0xa412fe9f, 138 .end = 0xa412fe9f,
142 .flags = IORESOURCE_MEM, 139 .flags = IORESOURCE_MEM,
@@ -158,16 +155,13 @@ static struct platform_device tmu0_device = {
158}; 155};
159 156
160static struct sh_timer_config tmu1_platform_data = { 157static struct sh_timer_config tmu1_platform_data = {
161 .name = "TMU1",
162 .channel_offset = 0xe, 158 .channel_offset = 0xe,
163 .timer_bit = 1, 159 .timer_bit = 1,
164 .clk = "peripheral_clk",
165 .clocksource_rating = 200, 160 .clocksource_rating = 200,
166}; 161};
167 162
168static struct resource tmu1_resources[] = { 163static struct resource tmu1_resources[] = {
169 [0] = { 164 [0] = {
170 .name = "TMU1",
171 .start = 0xa412fea0, 165 .start = 0xa412fea0,
172 .end = 0xa412feab, 166 .end = 0xa412feab,
173 .flags = IORESOURCE_MEM, 167 .flags = IORESOURCE_MEM,
@@ -189,15 +183,12 @@ static struct platform_device tmu1_device = {
189}; 183};
190 184
191static struct sh_timer_config tmu2_platform_data = { 185static struct sh_timer_config tmu2_platform_data = {
192 .name = "TMU2",
193 .channel_offset = 0x1a, 186 .channel_offset = 0x1a,
194 .timer_bit = 2, 187 .timer_bit = 2,
195 .clk = "peripheral_clk",
196}; 188};
197 189
198static struct resource tmu2_resources[] = { 190static struct resource tmu2_resources[] = {
199 [0] = { 191 [0] = {
200 .name = "TMU2",
201 .start = 0xa412feac, 192 .start = 0xa412feac,
202 .end = 0xa412feb5, 193 .end = 0xa412feb5,
203 .flags = IORESOURCE_MEM, 194 .flags = IORESOURCE_MEM,
diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7720.c b/arch/sh/kernel/cpu/sh3/setup-sh7720.c
index a718a6231091..24b17135d5d2 100644
--- a/arch/sh/kernel/cpu/sh3/setup-sh7720.c
+++ b/arch/sh/kernel/cpu/sh3/setup-sh7720.c
@@ -130,17 +130,14 @@ static struct platform_device usbf_device = {
130}; 130};
131 131
132static struct sh_timer_config cmt0_platform_data = { 132static struct sh_timer_config cmt0_platform_data = {
133 .name = "CMT0",
134 .channel_offset = 0x10, 133 .channel_offset = 0x10,
135 .timer_bit = 0, 134 .timer_bit = 0,
136 .clk = "peripheral_clk",
137 .clockevent_rating = 125, 135 .clockevent_rating = 125,
138 .clocksource_rating = 125, 136 .clocksource_rating = 125,
139}; 137};
140 138
141static struct resource cmt0_resources[] = { 139static struct resource cmt0_resources[] = {
142 [0] = { 140 [0] = {
143 .name = "CMT0",
144 .start = 0x044a0010, 141 .start = 0x044a0010,
145 .end = 0x044a001b, 142 .end = 0x044a001b,
146 .flags = IORESOURCE_MEM, 143 .flags = IORESOURCE_MEM,
@@ -162,15 +159,12 @@ static struct platform_device cmt0_device = {
162}; 159};
163 160
164static struct sh_timer_config cmt1_platform_data = { 161static struct sh_timer_config cmt1_platform_data = {
165 .name = "CMT1",
166 .channel_offset = 0x20, 162 .channel_offset = 0x20,
167 .timer_bit = 1, 163 .timer_bit = 1,
168 .clk = "peripheral_clk",
169}; 164};
170 165
171static struct resource cmt1_resources[] = { 166static struct resource cmt1_resources[] = {
172 [0] = { 167 [0] = {
173 .name = "CMT1",
174 .start = 0x044a0020, 168 .start = 0x044a0020,
175 .end = 0x044a002b, 169 .end = 0x044a002b,
176 .flags = IORESOURCE_MEM, 170 .flags = IORESOURCE_MEM,
@@ -192,15 +186,12 @@ static struct platform_device cmt1_device = {
192}; 186};
193 187
194static struct sh_timer_config cmt2_platform_data = { 188static struct sh_timer_config cmt2_platform_data = {
195 .name = "CMT2",
196 .channel_offset = 0x30, 189 .channel_offset = 0x30,
197 .timer_bit = 2, 190 .timer_bit = 2,
198 .clk = "peripheral_clk",
199}; 191};
200 192
201static struct resource cmt2_resources[] = { 193static struct resource cmt2_resources[] = {
202 [0] = { 194 [0] = {
203 .name = "CMT2",
204 .start = 0x044a0030, 195 .start = 0x044a0030,
205 .end = 0x044a003b, 196 .end = 0x044a003b,
206 .flags = IORESOURCE_MEM, 197 .flags = IORESOURCE_MEM,
@@ -222,15 +213,12 @@ static struct platform_device cmt2_device = {
222}; 213};
223 214
224static struct sh_timer_config cmt3_platform_data = { 215static struct sh_timer_config cmt3_platform_data = {
225 .name = "CMT3",
226 .channel_offset = 0x40, 216 .channel_offset = 0x40,
227 .timer_bit = 3, 217 .timer_bit = 3,
228 .clk = "peripheral_clk",
229}; 218};
230 219
231static struct resource cmt3_resources[] = { 220static struct resource cmt3_resources[] = {
232 [0] = { 221 [0] = {
233 .name = "CMT3",
234 .start = 0x044a0040, 222 .start = 0x044a0040,
235 .end = 0x044a004b, 223 .end = 0x044a004b,
236 .flags = IORESOURCE_MEM, 224 .flags = IORESOURCE_MEM,
@@ -252,15 +240,12 @@ static struct platform_device cmt3_device = {
252}; 240};
253 241
254static struct sh_timer_config cmt4_platform_data = { 242static struct sh_timer_config cmt4_platform_data = {
255 .name = "CMT4",
256 .channel_offset = 0x50, 243 .channel_offset = 0x50,
257 .timer_bit = 4, 244 .timer_bit = 4,
258 .clk = "peripheral_clk",
259}; 245};
260 246
261static struct resource cmt4_resources[] = { 247static struct resource cmt4_resources[] = {
262 [0] = { 248 [0] = {
263 .name = "CMT4",
264 .start = 0x044a0050, 249 .start = 0x044a0050,
265 .end = 0x044a005b, 250 .end = 0x044a005b,
266 .flags = IORESOURCE_MEM, 251 .flags = IORESOURCE_MEM,
@@ -282,16 +267,13 @@ static struct platform_device cmt4_device = {
282}; 267};
283 268
284static struct sh_timer_config tmu0_platform_data = { 269static struct sh_timer_config tmu0_platform_data = {
285 .name = "TMU0",
286 .channel_offset = 0x02, 270 .channel_offset = 0x02,
287 .timer_bit = 0, 271 .timer_bit = 0,
288 .clk = "peripheral_clk",
289 .clockevent_rating = 200, 272 .clockevent_rating = 200,
290}; 273};
291 274
292static struct resource tmu0_resources[] = { 275static struct resource tmu0_resources[] = {
293 [0] = { 276 [0] = {
294 .name = "TMU0",
295 .start = 0xa412fe94, 277 .start = 0xa412fe94,
296 .end = 0xa412fe9f, 278 .end = 0xa412fe9f,
297 .flags = IORESOURCE_MEM, 279 .flags = IORESOURCE_MEM,
@@ -313,16 +295,13 @@ static struct platform_device tmu0_device = {
313}; 295};
314 296
315static struct sh_timer_config tmu1_platform_data = { 297static struct sh_timer_config tmu1_platform_data = {
316 .name = "TMU1",
317 .channel_offset = 0xe, 298 .channel_offset = 0xe,
318 .timer_bit = 1, 299 .timer_bit = 1,
319 .clk = "peripheral_clk",
320 .clocksource_rating = 200, 300 .clocksource_rating = 200,
321}; 301};
322 302
323static struct resource tmu1_resources[] = { 303static struct resource tmu1_resources[] = {
324 [0] = { 304 [0] = {
325 .name = "TMU1",
326 .start = 0xa412fea0, 305 .start = 0xa412fea0,
327 .end = 0xa412feab, 306 .end = 0xa412feab,
328 .flags = IORESOURCE_MEM, 307 .flags = IORESOURCE_MEM,
@@ -344,15 +323,12 @@ static struct platform_device tmu1_device = {
344}; 323};
345 324
346static struct sh_timer_config tmu2_platform_data = { 325static struct sh_timer_config tmu2_platform_data = {
347 .name = "TMU2",
348 .channel_offset = 0x1a, 326 .channel_offset = 0x1a,
349 .timer_bit = 2, 327 .timer_bit = 2,
350 .clk = "peripheral_clk",
351}; 328};
352 329
353static struct resource tmu2_resources[] = { 330static struct resource tmu2_resources[] = {
354 [0] = { 331 [0] = {
355 .name = "TMU2",
356 .start = 0xa412feac, 332 .start = 0xa412feac,
357 .end = 0xa412feb5, 333 .end = 0xa412feb5,
358 .flags = IORESOURCE_MEM, 334 .flags = IORESOURCE_MEM,
diff --git a/arch/sh/kernel/cpu/sh4/setup-sh4-202.c b/arch/sh/kernel/cpu/sh4/setup-sh4-202.c
index b9b7e10ad68f..e916b18e1f7c 100644
--- a/arch/sh/kernel/cpu/sh4/setup-sh4-202.c
+++ b/arch/sh/kernel/cpu/sh4/setup-sh4-202.c
@@ -31,16 +31,13 @@ static struct platform_device scif0_device = {
31}; 31};
32 32
33static struct sh_timer_config tmu0_platform_data = { 33static struct sh_timer_config tmu0_platform_data = {
34 .name = "TMU0",
35 .channel_offset = 0x04, 34 .channel_offset = 0x04,
36 .timer_bit = 0, 35 .timer_bit = 0,
37 .clk = "peripheral_clk",
38 .clockevent_rating = 200, 36 .clockevent_rating = 200,
39}; 37};
40 38
41static struct resource tmu0_resources[] = { 39static struct resource tmu0_resources[] = {
42 [0] = { 40 [0] = {
43 .name = "TMU0",
44 .start = 0xffd80008, 41 .start = 0xffd80008,
45 .end = 0xffd80013, 42 .end = 0xffd80013,
46 .flags = IORESOURCE_MEM, 43 .flags = IORESOURCE_MEM,
@@ -62,16 +59,13 @@ static struct platform_device tmu0_device = {
62}; 59};
63 60
64static struct sh_timer_config tmu1_platform_data = { 61static struct sh_timer_config tmu1_platform_data = {
65 .name = "TMU1",
66 .channel_offset = 0x10, 62 .channel_offset = 0x10,
67 .timer_bit = 1, 63 .timer_bit = 1,
68 .clk = "peripheral_clk",
69 .clocksource_rating = 200, 64 .clocksource_rating = 200,
70}; 65};
71 66
72static struct resource tmu1_resources[] = { 67static struct resource tmu1_resources[] = {
73 [0] = { 68 [0] = {
74 .name = "TMU1",
75 .start = 0xffd80014, 69 .start = 0xffd80014,
76 .end = 0xffd8001f, 70 .end = 0xffd8001f,
77 .flags = IORESOURCE_MEM, 71 .flags = IORESOURCE_MEM,
@@ -93,15 +87,12 @@ static struct platform_device tmu1_device = {
93}; 87};
94 88
95static struct sh_timer_config tmu2_platform_data = { 89static struct sh_timer_config tmu2_platform_data = {
96 .name = "TMU2",
97 .channel_offset = 0x1c, 90 .channel_offset = 0x1c,
98 .timer_bit = 2, 91 .timer_bit = 2,
99 .clk = "peripheral_clk",
100}; 92};
101 93
102static struct resource tmu2_resources[] = { 94static struct resource tmu2_resources[] = {
103 [0] = { 95 [0] = {
104 .name = "TMU2",
105 .start = 0xffd80020, 96 .start = 0xffd80020,
106 .end = 0xffd8002f, 97 .end = 0xffd8002f,
107 .flags = IORESOURCE_MEM, 98 .flags = IORESOURCE_MEM,
diff --git a/arch/sh/kernel/cpu/sh4/setup-sh7750.c b/arch/sh/kernel/cpu/sh4/setup-sh7750.c
index ffd79e57254f..911d196e86b5 100644
--- a/arch/sh/kernel/cpu/sh4/setup-sh7750.c
+++ b/arch/sh/kernel/cpu/sh4/setup-sh7750.c
@@ -66,16 +66,13 @@ static struct platform_device scif1_device = {
66}; 66};
67 67
68static struct sh_timer_config tmu0_platform_data = { 68static struct sh_timer_config tmu0_platform_data = {
69 .name = "TMU0",
70 .channel_offset = 0x04, 69 .channel_offset = 0x04,
71 .timer_bit = 0, 70 .timer_bit = 0,
72 .clk = "peripheral_clk",
73 .clockevent_rating = 200, 71 .clockevent_rating = 200,
74}; 72};
75 73
76static struct resource tmu0_resources[] = { 74static struct resource tmu0_resources[] = {
77 [0] = { 75 [0] = {
78 .name = "TMU0",
79 .start = 0xffd80008, 76 .start = 0xffd80008,
80 .end = 0xffd80013, 77 .end = 0xffd80013,
81 .flags = IORESOURCE_MEM, 78 .flags = IORESOURCE_MEM,
@@ -97,16 +94,13 @@ static struct platform_device tmu0_device = {
97}; 94};
98 95
99static struct sh_timer_config tmu1_platform_data = { 96static struct sh_timer_config tmu1_platform_data = {
100 .name = "TMU1",
101 .channel_offset = 0x10, 97 .channel_offset = 0x10,
102 .timer_bit = 1, 98 .timer_bit = 1,
103 .clk = "peripheral_clk",
104 .clocksource_rating = 200, 99 .clocksource_rating = 200,
105}; 100};
106 101
107static struct resource tmu1_resources[] = { 102static struct resource tmu1_resources[] = {
108 [0] = { 103 [0] = {
109 .name = "TMU1",
110 .start = 0xffd80014, 104 .start = 0xffd80014,
111 .end = 0xffd8001f, 105 .end = 0xffd8001f,
112 .flags = IORESOURCE_MEM, 106 .flags = IORESOURCE_MEM,
@@ -128,15 +122,12 @@ static struct platform_device tmu1_device = {
128}; 122};
129 123
130static struct sh_timer_config tmu2_platform_data = { 124static struct sh_timer_config tmu2_platform_data = {
131 .name = "TMU2",
132 .channel_offset = 0x1c, 125 .channel_offset = 0x1c,
133 .timer_bit = 2, 126 .timer_bit = 2,
134 .clk = "peripheral_clk",
135}; 127};
136 128
137static struct resource tmu2_resources[] = { 129static struct resource tmu2_resources[] = {
138 [0] = { 130 [0] = {
139 .name = "TMU2",
140 .start = 0xffd80020, 131 .start = 0xffd80020,
141 .end = 0xffd8002f, 132 .end = 0xffd8002f,
142 .flags = IORESOURCE_MEM, 133 .flags = IORESOURCE_MEM,
@@ -163,15 +154,12 @@ static struct platform_device tmu2_device = {
163 defined(CONFIG_CPU_SUBTYPE_SH7751R) 154 defined(CONFIG_CPU_SUBTYPE_SH7751R)
164 155
165static struct sh_timer_config tmu3_platform_data = { 156static struct sh_timer_config tmu3_platform_data = {
166 .name = "TMU3",
167 .channel_offset = 0x04, 157 .channel_offset = 0x04,
168 .timer_bit = 0, 158 .timer_bit = 0,
169 .clk = "peripheral_clk",
170}; 159};
171 160
172static struct resource tmu3_resources[] = { 161static struct resource tmu3_resources[] = {
173 [0] = { 162 [0] = {
174 .name = "TMU3",
175 .start = 0xfe100008, 163 .start = 0xfe100008,
176 .end = 0xfe100013, 164 .end = 0xfe100013,
177 .flags = IORESOURCE_MEM, 165 .flags = IORESOURCE_MEM,
@@ -193,15 +181,12 @@ static struct platform_device tmu3_device = {
193}; 181};
194 182
195static struct sh_timer_config tmu4_platform_data = { 183static struct sh_timer_config tmu4_platform_data = {
196 .name = "TMU4",
197 .channel_offset = 0x10, 184 .channel_offset = 0x10,
198 .timer_bit = 1, 185 .timer_bit = 1,
199 .clk = "peripheral_clk",
200}; 186};
201 187
202static struct resource tmu4_resources[] = { 188static struct resource tmu4_resources[] = {
203 [0] = { 189 [0] = {
204 .name = "TMU4",
205 .start = 0xfe100014, 190 .start = 0xfe100014,
206 .end = 0xfe10001f, 191 .end = 0xfe10001f,
207 .flags = IORESOURCE_MEM, 192 .flags = IORESOURCE_MEM,
diff --git a/arch/sh/kernel/cpu/sh4/setup-sh7760.c b/arch/sh/kernel/cpu/sh4/setup-sh7760.c
index a16eb3656f4b..48ea8fe85dc5 100644
--- a/arch/sh/kernel/cpu/sh4/setup-sh7760.c
+++ b/arch/sh/kernel/cpu/sh4/setup-sh7760.c
@@ -187,16 +187,13 @@ static struct platform_device scif3_device = {
187}; 187};
188 188
189static struct sh_timer_config tmu0_platform_data = { 189static struct sh_timer_config tmu0_platform_data = {
190 .name = "TMU0",
191 .channel_offset = 0x04, 190 .channel_offset = 0x04,
192 .timer_bit = 0, 191 .timer_bit = 0,
193 .clk = "peripheral_clk",
194 .clockevent_rating = 200, 192 .clockevent_rating = 200,
195}; 193};
196 194
197static struct resource tmu0_resources[] = { 195static struct resource tmu0_resources[] = {
198 [0] = { 196 [0] = {
199 .name = "TMU0",
200 .start = 0xffd80008, 197 .start = 0xffd80008,
201 .end = 0xffd80013, 198 .end = 0xffd80013,
202 .flags = IORESOURCE_MEM, 199 .flags = IORESOURCE_MEM,
@@ -218,16 +215,13 @@ static struct platform_device tmu0_device = {
218}; 215};
219 216
220static struct sh_timer_config tmu1_platform_data = { 217static struct sh_timer_config tmu1_platform_data = {
221 .name = "TMU1",
222 .channel_offset = 0x10, 218 .channel_offset = 0x10,
223 .timer_bit = 1, 219 .timer_bit = 1,
224 .clk = "peripheral_clk",
225 .clocksource_rating = 200, 220 .clocksource_rating = 200,
226}; 221};
227 222
228static struct resource tmu1_resources[] = { 223static struct resource tmu1_resources[] = {
229 [0] = { 224 [0] = {
230 .name = "TMU1",
231 .start = 0xffd80014, 225 .start = 0xffd80014,
232 .end = 0xffd8001f, 226 .end = 0xffd8001f,
233 .flags = IORESOURCE_MEM, 227 .flags = IORESOURCE_MEM,
@@ -249,15 +243,12 @@ static struct platform_device tmu1_device = {
249}; 243};
250 244
251static struct sh_timer_config tmu2_platform_data = { 245static struct sh_timer_config tmu2_platform_data = {
252 .name = "TMU2",
253 .channel_offset = 0x1c, 246 .channel_offset = 0x1c,
254 .timer_bit = 2, 247 .timer_bit = 2,
255 .clk = "peripheral_clk",
256}; 248};
257 249
258static struct resource tmu2_resources[] = { 250static struct resource tmu2_resources[] = {
259 [0] = { 251 [0] = {
260 .name = "TMU2",
261 .start = 0xffd80020, 252 .start = 0xffd80020,
262 .end = 0xffd8002f, 253 .end = 0xffd8002f,
263 .flags = IORESOURCE_MEM, 254 .flags = IORESOURCE_MEM,
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7343.c b/arch/sh/kernel/cpu/sh4a/clock-sh7343.c
index 2c16df37eda6..a63cdcaee0b2 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7343.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7343.c
@@ -154,15 +154,15 @@ static struct clk mstp_clks[] = {
154 MSTP("sh0", &div4_clks[DIV4_P], MSTPCR0, 20, 0), 154 MSTP("sh0", &div4_clks[DIV4_P], MSTPCR0, 20, 0),
155 MSTP("hudi0", &div4_clks[DIV4_P], MSTPCR0, 19, 0), 155 MSTP("hudi0", &div4_clks[DIV4_P], MSTPCR0, 19, 0),
156 MSTP("ubc0", &div4_clks[DIV4_P], MSTPCR0, 17, 0), 156 MSTP("ubc0", &div4_clks[DIV4_P], MSTPCR0, 17, 0),
157 MSTP("tmu0", &div4_clks[DIV4_P], MSTPCR0, 15, 0), 157 MSTP("tmu_fck", &div4_clks[DIV4_P], MSTPCR0, 15, 0),
158 MSTP("cmt0", &r_clk, MSTPCR0, 14, 0), 158 MSTP("cmt_fck", &r_clk, MSTPCR0, 14, 0),
159 MSTP("rwdt0", &r_clk, MSTPCR0, 13, 0), 159 MSTP("rwdt0", &r_clk, MSTPCR0, 13, 0),
160 MSTP("mfi0", &div4_clks[DIV4_P], MSTPCR0, 11, 0), 160 MSTP("mfi0", &div4_clks[DIV4_P], MSTPCR0, 11, 0),
161 MSTP("flctl0", &div4_clks[DIV4_P], MSTPCR0, 10, 0), 161 MSTP("flctl0", &div4_clks[DIV4_P], MSTPCR0, 10, 0),
162 MSTP("scif0", &div4_clks[DIV4_P], MSTPCR0, 7, 0), 162 SH_CLK_MSTP32("sci_fck", 0, &div4_clks[DIV4_P], MSTPCR0, 7, 0),
163 MSTP("scif1", &div4_clks[DIV4_P], MSTPCR0, 6, 0), 163 SH_CLK_MSTP32("sci_fck", 1, &div4_clks[DIV4_P], MSTPCR0, 6, 0),
164 MSTP("scif2", &div4_clks[DIV4_P], MSTPCR0, 5, 0), 164 SH_CLK_MSTP32("sci_fck", 2, &div4_clks[DIV4_P], MSTPCR0, 5, 0),
165 MSTP("scif3", &div4_clks[DIV4_P], MSTPCR0, 4, 0), 165 SH_CLK_MSTP32("sci_fck", 3, &div4_clks[DIV4_P], MSTPCR0, 4, 0),
166 MSTP("sio0", &div4_clks[DIV4_P], MSTPCR0, 3, 0), 166 MSTP("sio0", &div4_clks[DIV4_P], MSTPCR0, 3, 0),
167 MSTP("siof0", &div4_clks[DIV4_P], MSTPCR0, 2, 0), 167 MSTP("siof0", &div4_clks[DIV4_P], MSTPCR0, 2, 0),
168 MSTP("siof1", &div4_clks[DIV4_P], MSTPCR0, 1, 0), 168 MSTP("siof1", &div4_clks[DIV4_P], MSTPCR0, 1, 0),
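
The renamed entries switch from per-instance clock names ("scif0") to a shared connection-style name plus an instance number, leaning on the clk_get() fallback described in clock.c: first the clkdev table keyed by device name, then clk->name matched against the ID string with clk->id matched against the platform device id. So, illustratively:

/* SH_CLK_MSTP32("sci_fck", 1, ...) is found by the sh-sci.1 device */
static struct clk *get_sci1_clk(struct platform_device *sci1_pdev)
{
	return clk_get(&sci1_pdev->dev, "sci_fck");
}
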
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7366.c b/arch/sh/kernel/cpu/sh4a/clock-sh7366.c
index 91588d280cd8..f99db94cf8fb 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7366.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7366.c
@@ -158,14 +158,14 @@ static struct clk mstp_clks[] = {
158 MSTP("sh0", &div4_clks[DIV4_P], MSTPCR0, 20, 0), 158 MSTP("sh0", &div4_clks[DIV4_P], MSTPCR0, 20, 0),
159 MSTP("hudi0", &div4_clks[DIV4_P], MSTPCR0, 19, 0), 159 MSTP("hudi0", &div4_clks[DIV4_P], MSTPCR0, 19, 0),
160 MSTP("ubc0", &div4_clks[DIV4_P], MSTPCR0, 17, 0), 160 MSTP("ubc0", &div4_clks[DIV4_P], MSTPCR0, 17, 0),
161 MSTP("tmu0", &div4_clks[DIV4_P], MSTPCR0, 15, 0), 161 MSTP("tmu_fck", &div4_clks[DIV4_P], MSTPCR0, 15, 0),
162 MSTP("cmt0", &r_clk, MSTPCR0, 14, 0), 162 MSTP("cmt_fck", &r_clk, MSTPCR0, 14, 0),
163 MSTP("rwdt0", &r_clk, MSTPCR0, 13, 0), 163 MSTP("rwdt0", &r_clk, MSTPCR0, 13, 0),
164 MSTP("mfi0", &div4_clks[DIV4_P], MSTPCR0, 11, 0), 164 MSTP("mfi0", &div4_clks[DIV4_P], MSTPCR0, 11, 0),
165 MSTP("flctl0", &div4_clks[DIV4_P], MSTPCR0, 10, 0), 165 MSTP("flctl0", &div4_clks[DIV4_P], MSTPCR0, 10, 0),
166 MSTP("scif0", &div4_clks[DIV4_P], MSTPCR0, 7, 0), 166 SH_CLK_MSTP32("sci_fck", 0, &div4_clks[DIV4_P], MSTPCR0, 7, 0),
167 MSTP("scif1", &div4_clks[DIV4_P], MSTPCR0, 6, 0), 167 SH_CLK_MSTP32("sci_fck", 1, &div4_clks[DIV4_P], MSTPCR0, 6, 0),
168 MSTP("scif2", &div4_clks[DIV4_P], MSTPCR0, 5, 0), 168 SH_CLK_MSTP32("sci_fck", 2, &div4_clks[DIV4_P], MSTPCR0, 5, 0),
169 MSTP("msiof0", &div4_clks[DIV4_P], MSTPCR0, 2, 0), 169 MSTP("msiof0", &div4_clks[DIV4_P], MSTPCR0, 2, 0),
170 MSTP("sbr0", &div4_clks[DIV4_P], MSTPCR0, 1, 0), 170 MSTP("sbr0", &div4_clks[DIV4_P], MSTPCR0, 1, 0),
171 171
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index 15db6d521c5c..107b200e78bd 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -160,13 +160,13 @@ struct clk div6_clks[] = {
160static struct clk mstp_clks[] = { 160static struct clk mstp_clks[] = {
161 SH_HWBLK_CLK("uram0", -1, U_CLK, HWBLK_URAM, CLK_ENABLE_ON_INIT), 161 SH_HWBLK_CLK("uram0", -1, U_CLK, HWBLK_URAM, CLK_ENABLE_ON_INIT),
162 SH_HWBLK_CLK("xymem0", -1, B_CLK, HWBLK_XYMEM, CLK_ENABLE_ON_INIT), 162 SH_HWBLK_CLK("xymem0", -1, B_CLK, HWBLK_XYMEM, CLK_ENABLE_ON_INIT),
163 SH_HWBLK_CLK("tmu0", -1, P_CLK, HWBLK_TMU, 0), 163 SH_HWBLK_CLK("tmu_fck", -1, P_CLK, HWBLK_TMU, 0),
164 SH_HWBLK_CLK("cmt0", -1, R_CLK, HWBLK_CMT, 0), 164 SH_HWBLK_CLK("cmt_fck", -1, R_CLK, HWBLK_CMT, 0),
165 SH_HWBLK_CLK("rwdt0", -1, R_CLK, HWBLK_RWDT, 0), 165 SH_HWBLK_CLK("rwdt0", -1, R_CLK, HWBLK_RWDT, 0),
166 SH_HWBLK_CLK("flctl0", -1, P_CLK, HWBLK_FLCTL, 0), 166 SH_HWBLK_CLK("flctl0", -1, P_CLK, HWBLK_FLCTL, 0),
167 SH_HWBLK_CLK("scif0", -1, P_CLK, HWBLK_SCIF0, 0), 167 SH_HWBLK_CLK("sci_fck", 0, P_CLK, HWBLK_SCIF0, 0),
168 SH_HWBLK_CLK("scif1", -1, P_CLK, HWBLK_SCIF1, 0), 168 SH_HWBLK_CLK("sci_fck", 1, P_CLK, HWBLK_SCIF1, 0),
169 SH_HWBLK_CLK("scif2", -1, P_CLK, HWBLK_SCIF2, 0), 169 SH_HWBLK_CLK("sci_fck", 2, P_CLK, HWBLK_SCIF2, 0),
170 170
171 SH_HWBLK_CLK("i2c0", -1, P_CLK, HWBLK_IIC, 0), 171 SH_HWBLK_CLK("i2c0", -1, P_CLK, HWBLK_IIC, 0),
172 SH_HWBLK_CLK("rtc0", -1, R_CLK, HWBLK_RTC, 0), 172 SH_HWBLK_CLK("rtc0", -1, R_CLK, HWBLK_RTC, 0),
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7723.c b/arch/sh/kernel/cpu/sh4a/clock-sh7723.c
index 50babe01fe44..fc86c88223f4 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7723.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7723.c
@@ -21,6 +21,8 @@
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/kernel.h> 22#include <linux/kernel.h>
23#include <linux/io.h> 23#include <linux/io.h>
24#include <linux/clk.h>
25#include <asm/clkdev.h>
24#include <asm/clock.h> 26#include <asm/clock.h>
25#include <asm/hwblk.h> 27#include <asm/hwblk.h>
26#include <cpu/sh7723.h> 28#include <cpu/sh7723.h>
@@ -171,18 +173,18 @@ static struct clk mstp_clks[] = {
171 SH_HWBLK_CLK("sh0", -1, SH_CLK, HWBLK_SHYWAY, CLK_ENABLE_ON_INIT), 173 SH_HWBLK_CLK("sh0", -1, SH_CLK, HWBLK_SHYWAY, CLK_ENABLE_ON_INIT),
172 SH_HWBLK_CLK("hudi0", -1, P_CLK, HWBLK_HUDI, 0), 174 SH_HWBLK_CLK("hudi0", -1, P_CLK, HWBLK_HUDI, 0),
173 SH_HWBLK_CLK("ubc0", -1, I_CLK, HWBLK_UBC, 0), 175 SH_HWBLK_CLK("ubc0", -1, I_CLK, HWBLK_UBC, 0),
174 SH_HWBLK_CLK("tmu0", -1, P_CLK, HWBLK_TMU0, 0), 176 SH_HWBLK_CLK("tmu012_fck", -1, P_CLK, HWBLK_TMU0, 0),
175 SH_HWBLK_CLK("cmt0", -1, R_CLK, HWBLK_CMT, 0), 177 SH_HWBLK_CLK("cmt_fck", -1, R_CLK, HWBLK_CMT, 0),
176 SH_HWBLK_CLK("rwdt0", -1, R_CLK, HWBLK_RWDT, 0), 178 SH_HWBLK_CLK("rwdt0", -1, R_CLK, HWBLK_RWDT, 0),
177 SH_HWBLK_CLK("dmac1", -1, B_CLK, HWBLK_DMAC1, 0), 179 SH_HWBLK_CLK("dmac1", -1, B_CLK, HWBLK_DMAC1, 0),
178 SH_HWBLK_CLK("tmu1", -1, P_CLK, HWBLK_TMU1, 0), 180 SH_HWBLK_CLK("tmu345_fck", -1, P_CLK, HWBLK_TMU1, 0),
179 SH_HWBLK_CLK("flctl0", -1, P_CLK, HWBLK_FLCTL, 0), 181 SH_HWBLK_CLK("flctl0", -1, P_CLK, HWBLK_FLCTL, 0),
180 SH_HWBLK_CLK("scif0", -1, P_CLK, HWBLK_SCIF0, 0), 182 SH_HWBLK_CLK("sci_fck", 0, P_CLK, HWBLK_SCIF0, 0),
181 SH_HWBLK_CLK("scif1", -1, P_CLK, HWBLK_SCIF1, 0), 183 SH_HWBLK_CLK("sci_fck", 1, P_CLK, HWBLK_SCIF1, 0),
182 SH_HWBLK_CLK("scif2", -1, P_CLK, HWBLK_SCIF2, 0), 184 SH_HWBLK_CLK("sci_fck", 2, P_CLK, HWBLK_SCIF2, 0),
183 SH_HWBLK_CLK("scif3", -1, B_CLK, HWBLK_SCIF3, 0), 185 SH_HWBLK_CLK("sci_fck", 3, B_CLK, HWBLK_SCIF3, 0),
184 SH_HWBLK_CLK("scif4", -1, B_CLK, HWBLK_SCIF4, 0), 186 SH_HWBLK_CLK("sci_fck", 4, B_CLK, HWBLK_SCIF4, 0),
185 SH_HWBLK_CLK("scif5", -1, B_CLK, HWBLK_SCIF5, 0), 187 SH_HWBLK_CLK("sci_fck", 5, B_CLK, HWBLK_SCIF5, 0),
186 SH_HWBLK_CLK("msiof0", -1, B_CLK, HWBLK_MSIOF0, 0), 188 SH_HWBLK_CLK("msiof0", -1, B_CLK, HWBLK_MSIOF0, 0),
187 SH_HWBLK_CLK("msiof1", -1, B_CLK, HWBLK_MSIOF1, 0), 189 SH_HWBLK_CLK("msiof1", -1, B_CLK, HWBLK_MSIOF1, 0),
188 SH_HWBLK_CLK("meram0", -1, SH_CLK, HWBLK_MERAM, 0), 190 SH_HWBLK_CLK("meram0", -1, SH_CLK, HWBLK_MERAM, 0),
@@ -211,6 +213,40 @@ static struct clk mstp_clks[] = {
211 SH_HWBLK_CLK("lcdc0", -1, B_CLK, HWBLK_LCDC, 0), 213 SH_HWBLK_CLK("lcdc0", -1, B_CLK, HWBLK_LCDC, 0),
212}; 214};
213 215
216static struct clk_lookup lookups[] = {
217 {
218 /* TMU0 */
219 .dev_id = "sh_tmu.0",
220 .con_id = "tmu_fck",
221 .clk = &mstp_clks[11], /* tmu012_fck */
222 }, {
223 /* TMU1 */
224 .dev_id = "sh_tmu.1",
225 .con_id = "tmu_fck",
226 .clk = &mstp_clks[11],
227 }, {
228 /* TMU2 */
229 .dev_id = "sh_tmu.2",
230 .con_id = "tmu_fck",
231 .clk = &mstp_clks[11],
232 }, {
233 /* TMU3 */
234 .dev_id = "sh_tmu.3",
235 .con_id = "tmu_fck",
236 .clk = &mstp_clks[15], /* tmu345_fck */
237 }, {
238 /* TMU4 */
239 .dev_id = "sh_tmu.4",
240 .con_id = "tmu_fck",
241 .clk = &mstp_clks[15],
242 }, {
243 /* TMU5 */
244 .dev_id = "sh_tmu.5",
245 .con_id = "tmu_fck",
246 .clk = &mstp_clks[15],
247 },
248};
249
214int __init arch_clk_init(void) 250int __init arch_clk_init(void)
215{ 251{
216 int k, ret = 0; 252 int k, ret = 0;
@@ -222,7 +258,9 @@ int __init arch_clk_init(void)
222 pll_clk.parent = &extal_clk; 258 pll_clk.parent = &extal_clk;
223 259
224 for (k = 0; !ret && (k < ARRAY_SIZE(main_clks)); k++) 260 for (k = 0; !ret && (k < ARRAY_SIZE(main_clks)); k++)
225 ret = clk_register(main_clks[k]); 261 ret |= clk_register(main_clks[k]);
262
263 clkdev_add_table(lookups, ARRAY_SIZE(lookups));
226 264
227 if (!ret) 265 if (!ret)
228 ret = sh_clk_div4_register(div4_clks, DIV4_NR, &div4_table); 266 ret = sh_clk_div4_register(div4_clks, DIV4_NR, &div4_table);
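The lookups[] table added above is the heart of the conversion: a driver no longer receives a clock name through platform data, it calls clk_get() with its own device and a fixed connection id, and clkdev matches that request against the registered dev_id/con_id pairs. A minimal consumer-side sketch, assuming only the generic clkdev matching semantics (the device and connection names come straight from the table above):

#include <linux/clk.h>
#include <linux/device.h>
#include <linux/err.h>

/* Resolve and start the TMU0 function clock from the driver side.
 * dev_name(dev) is "sh_tmu.0" for the first channel, so the request
 * matches the { "sh_tmu.0", "tmu_fck", &mstp_clks[11] } entry above. */
static int tmu_clk_setup(struct device *dev)
{
        struct clk *clk = clk_get(dev, "tmu_fck");

        if (IS_ERR(clk))
                return PTR_ERR(clk);

        return clk_enable(clk);
}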
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7724.c b/arch/sh/kernel/cpu/sh4a/clock-sh7724.c
index 6707061fbf54..f1583a23b3a5 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7724.c
@@ -21,6 +21,8 @@
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/kernel.h> 22#include <linux/kernel.h>
23#include <linux/io.h> 23#include <linux/io.h>
24#include <linux/clk.h>
25#include <asm/clkdev.h>
24#include <asm/clock.h> 26#include <asm/clock.h>
25#include <asm/hwblk.h> 27#include <asm/hwblk.h>
26#include <cpu/sh7724.h> 28#include <cpu/sh7724.h>
@@ -189,17 +191,17 @@ static struct clk mstp_clks[] = {
189 SH_HWBLK_CLK("sh0", -1, SH_CLK, HWBLK_SHYWAY, CLK_ENABLE_ON_INIT), 191 SH_HWBLK_CLK("sh0", -1, SH_CLK, HWBLK_SHYWAY, CLK_ENABLE_ON_INIT),
190 SH_HWBLK_CLK("hudi0", -1, P_CLK, HWBLK_HUDI, 0), 192 SH_HWBLK_CLK("hudi0", -1, P_CLK, HWBLK_HUDI, 0),
191 SH_HWBLK_CLK("ubc0", -1, I_CLK, HWBLK_UBC, 0), 193 SH_HWBLK_CLK("ubc0", -1, I_CLK, HWBLK_UBC, 0),
192 SH_HWBLK_CLK("tmu0", -1, P_CLK, HWBLK_TMU0, 0), 194 SH_HWBLK_CLK("tmu012_fck", -1, P_CLK, HWBLK_TMU0, 0),
193 SH_HWBLK_CLK("cmt0", -1, R_CLK, HWBLK_CMT, 0), 195 SH_HWBLK_CLK("cmt_fck", -1, R_CLK, HWBLK_CMT, 0),
194 SH_HWBLK_CLK("rwdt0", -1, R_CLK, HWBLK_RWDT, 0), 196 SH_HWBLK_CLK("rwdt0", -1, R_CLK, HWBLK_RWDT, 0),
195 SH_HWBLK_CLK("dmac1", -1, B_CLK, HWBLK_DMAC1, 0), 197 SH_HWBLK_CLK("dmac1", -1, B_CLK, HWBLK_DMAC1, 0),
196 SH_HWBLK_CLK("tmu1", -1, P_CLK, HWBLK_TMU1, 0), 198 SH_HWBLK_CLK("tmu345_fck", -1, P_CLK, HWBLK_TMU1, 0),
197 SH_HWBLK_CLK("scif0", -1, P_CLK, HWBLK_SCIF0, 0), 199 SH_HWBLK_CLK("sci_fck", 0, P_CLK, HWBLK_SCIF0, 0),
198 SH_HWBLK_CLK("scif1", -1, P_CLK, HWBLK_SCIF1, 0), 200 SH_HWBLK_CLK("sci_fck", 1, P_CLK, HWBLK_SCIF1, 0),
199 SH_HWBLK_CLK("scif2", -1, P_CLK, HWBLK_SCIF2, 0), 201 SH_HWBLK_CLK("sci_fck", 2, P_CLK, HWBLK_SCIF2, 0),
200 SH_HWBLK_CLK("scif3", -1, B_CLK, HWBLK_SCIF3, 0), 202 SH_HWBLK_CLK("sci_fck", 3, B_CLK, HWBLK_SCIF3, 0),
201 SH_HWBLK_CLK("scif4", -1, B_CLK, HWBLK_SCIF4, 0), 203 SH_HWBLK_CLK("sci_fck", 4, B_CLK, HWBLK_SCIF4, 0),
202 SH_HWBLK_CLK("scif5", -1, B_CLK, HWBLK_SCIF5, 0), 204 SH_HWBLK_CLK("sci_fck", 5, B_CLK, HWBLK_SCIF5, 0),
203 SH_HWBLK_CLK("msiof0", -1, B_CLK, HWBLK_MSIOF0, 0), 205 SH_HWBLK_CLK("msiof0", -1, B_CLK, HWBLK_MSIOF0, 0),
204 SH_HWBLK_CLK("msiof1", -1, B_CLK, HWBLK_MSIOF1, 0), 206 SH_HWBLK_CLK("msiof1", -1, B_CLK, HWBLK_MSIOF1, 0),
205 207
@@ -233,6 +235,40 @@ static struct clk mstp_clks[] = {
233 SH_HWBLK_CLK("lcdc0", -1, B_CLK, HWBLK_LCDC, 0), 235 SH_HWBLK_CLK("lcdc0", -1, B_CLK, HWBLK_LCDC, 0),
234}; 236};
235 237
238static struct clk_lookup lookups[] = {
239 {
240 /* TMU0 */
241 .dev_id = "sh_tmu.0",
242 .con_id = "tmu_fck",
243 .clk = &mstp_clks[12], /* tmu012_fck */
244 }, {
245 /* TMU1 */
246 .dev_id = "sh_tmu.1",
247 .con_id = "tmu_fck",
248 .clk = &mstp_clks[12],
249 }, {
250 /* TMU2 */
251 .dev_id = "sh_tmu.2",
252 .con_id = "tmu_fck",
253 .clk = &mstp_clks[12],
254 }, {
255 /* TMU3 */
256 .dev_id = "sh_tmu.3",
257 .con_id = "tmu_fck",
258 .clk = &mstp_clks[16], /* tmu345_fck */
259 }, {
260 /* TMU4 */
261 .dev_id = "sh_tmu.4",
262 .con_id = "tmu_fck",
263 .clk = &mstp_clks[16],
264 }, {
265 /* TMU5 */
266 .dev_id = "sh_tmu.5",
267 .con_id = "tmu_fck",
268 .clk = &mstp_clks[16],
269 },
270};
271
236int __init arch_clk_init(void) 272int __init arch_clk_init(void)
237{ 273{
238 int k, ret = 0; 274 int k, ret = 0;
@@ -246,6 +282,8 @@ int __init arch_clk_init(void)
246 for (k = 0; !ret && (k < ARRAY_SIZE(main_clks)); k++) 282 for (k = 0; !ret && (k < ARRAY_SIZE(main_clks)); k++)
247 ret = clk_register(main_clks[k]); 283 ret = clk_register(main_clks[k]);
248 284
285 clkdev_add_table(lookups, ARRAY_SIZE(lookups));
286
249 if (!ret) 287 if (!ret)
250 ret = sh_clk_div4_register(div4_clks, DIV4_NR, &div4_table); 288 ret = sh_clk_div4_register(div4_clks, DIV4_NR, &div4_table);
251 289
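Both files also rename the per-port clocks: "scif0" with id -1 becomes "sci_fck" with id 0, "tmu0" becomes "tmu012_fck", "cmt0" becomes "cmt_fck". The port number moves out of the clock name and into the clock's id field, which is what makes a single connection id per driver workable. A condensed paraphrase of the (name, id) fallback that the arch/sh clk_get() of this period is understood to apply when no clkdev entry matches (a sketch, not the verbatim implementation):

#include <linux/err.h>
#include <linux/list.h>
#include <linux/platform_device.h>
#include <linux/string.h>
#include <asm/clock.h>          /* arch/sh struct clk layout */

extern struct list_head clock_list;     /* arch/sh clock registry */

static struct clk *sketch_clk_get(struct device *dev, const char *id)
{
        struct clk *p;
        int idno = -1;

        if (dev && dev->bus == &platform_bus_type)
                idno = to_platform_device(dev)->id;

        /* First pass: exact (name, id) match, so platform device
         * "sh-sci.2" asking for "sci_fck" gets the HWBLK_SCIF2 clock. */
        list_for_each_entry(p, &clock_list, node)
                if (p->id == idno && !strcmp(id, p->name))
                        return p;

        /* Second pass: name only, which is how id -1 clocks such as
         * "cmt_fck" are still found. */
        list_for_each_entry(p, &clock_list, node)
                if (!strcmp(id, p->name))
                        return p;

        return ERR_PTR(-ENOENT);
}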
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index d997f0a25b10..28de049a59b1 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * SH7785 support for the clock framework 4 * SH7785 support for the clock framework
5 * 5 *
6 * Copyright (C) 2007 - 2009 Paul Mundt 6 * Copyright (C) 2007 - 2010 Paul Mundt
7 * 7 *
8 * This file is subject to the terms and conditions of the GNU General Public 8 * This file is subject to the terms and conditions of the GNU General Public
9 * License. See the file "COPYING" in the main directory of this archive 9 * License. See the file "COPYING" in the main directory of this archive
@@ -14,6 +14,7 @@
14#include <linux/clk.h> 14#include <linux/clk.h>
15#include <linux/io.h> 15#include <linux/io.h>
16#include <linux/cpufreq.h> 16#include <linux/cpufreq.h>
17#include <asm/clkdev.h>
17#include <asm/clock.h> 18#include <asm/clock.h>
18#include <asm/freq.h> 19#include <asm/freq.h>
19#include <cpu/sh7785.h> 20#include <cpu/sh7785.h>
@@ -88,12 +89,12 @@ struct clk div4_clks[DIV4_NR] = {
88 89
89static struct clk mstp_clks[] = { 90static struct clk mstp_clks[] = {
90 /* MSTPCR0 */ 91 /* MSTPCR0 */
91 SH_CLK_MSTP32("scif_fck", 5, &div4_clks[DIV4_P], MSTPCR0, 29, 0), 92 SH_CLK_MSTP32("sci_fck", 5, &div4_clks[DIV4_P], MSTPCR0, 29, 0),
92 SH_CLK_MSTP32("scif_fck", 4, &div4_clks[DIV4_P], MSTPCR0, 28, 0), 93 SH_CLK_MSTP32("sci_fck", 4, &div4_clks[DIV4_P], MSTPCR0, 28, 0),
93 SH_CLK_MSTP32("scif_fck", 3, &div4_clks[DIV4_P], MSTPCR0, 27, 0), 94 SH_CLK_MSTP32("sci_fck", 3, &div4_clks[DIV4_P], MSTPCR0, 27, 0),
94 SH_CLK_MSTP32("scif_fck", 2, &div4_clks[DIV4_P], MSTPCR0, 26, 0), 95 SH_CLK_MSTP32("sci_fck", 2, &div4_clks[DIV4_P], MSTPCR0, 26, 0),
95 SH_CLK_MSTP32("scif_fck", 1, &div4_clks[DIV4_P], MSTPCR0, 25, 0), 96 SH_CLK_MSTP32("sci_fck", 1, &div4_clks[DIV4_P], MSTPCR0, 25, 0),
96 SH_CLK_MSTP32("scif_fck", 0, &div4_clks[DIV4_P], MSTPCR0, 24, 0), 97 SH_CLK_MSTP32("sci_fck", 0, &div4_clks[DIV4_P], MSTPCR0, 24, 0),
97 SH_CLK_MSTP32("ssi_fck", 1, &div4_clks[DIV4_P], MSTPCR0, 21, 0), 98 SH_CLK_MSTP32("ssi_fck", 1, &div4_clks[DIV4_P], MSTPCR0, 21, 0),
98 SH_CLK_MSTP32("ssi_fck", 0, &div4_clks[DIV4_P], MSTPCR0, 20, 0), 99 SH_CLK_MSTP32("ssi_fck", 0, &div4_clks[DIV4_P], MSTPCR0, 20, 0),
99 SH_CLK_MSTP32("hac_fck", 1, &div4_clks[DIV4_P], MSTPCR0, 17, 0), 100 SH_CLK_MSTP32("hac_fck", 1, &div4_clks[DIV4_P], MSTPCR0, 17, 0),
@@ -113,12 +114,48 @@ static struct clk mstp_clks[] = {
113 SH_CLK_MSTP32("gdta_fck", -1, NULL, MSTPCR1, 0, 0), 114 SH_CLK_MSTP32("gdta_fck", -1, NULL, MSTPCR1, 0, 0),
114}; 115};
115 116
117static struct clk_lookup lookups[] = {
118 {
119 /* TMU0 */
120 .dev_id = "sh_tmu.0",
121 .con_id = "tmu_fck",
122 .clk = &mstp_clks[13], /* tmu012_fck */
123 }, {
124 /* TMU1 */
125 .dev_id = "sh_tmu.1",
126 .con_id = "tmu_fck",
127 .clk = &mstp_clks[13],
128 }, {
129 /* TMU2 */
130 .dev_id = "sh_tmu.2",
131 .con_id = "tmu_fck",
132 .clk = &mstp_clks[13],
133 }, {
134 /* TMU3 */
135 .dev_id = "sh_tmu.3",
136 .con_id = "tmu_fck",
137 .clk = &mstp_clks[12], /* tmu345_fck */
138 }, {
139 /* TMU4 */
140 .dev_id = "sh_tmu.4",
141 .con_id = "tmu_fck",
142 .clk = &mstp_clks[12],
143 }, {
144 /* TMU5 */
145 .dev_id = "sh_tmu.5",
146 .con_id = "tmu_fck",
147 .clk = &mstp_clks[12],
148 },
149};
150
116int __init arch_clk_init(void) 151int __init arch_clk_init(void)
117{ 152{
118 int i, ret = 0; 153 int i, ret = 0;
119 154
120 for (i = 0; i < ARRAY_SIZE(clks); i++) 155 for (i = 0; i < ARRAY_SIZE(clks); i++)
121 ret |= clk_register(clks[i]); 156 ret |= clk_register(clks[i]);
157 for (i = 0; i < ARRAY_SIZE(lookups); i++)
158 clkdev_add(&lookups[i]);
122 159
123 if (!ret) 160 if (!ret)
124 ret = sh_clk_div4_register(div4_clks, ARRAY_SIZE(div4_clks), 161 ret = sh_clk_div4_register(div4_clks, ARRAY_SIZE(div4_clks),
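Unlike the SH7723/SH7724 conversions above, this file (and the SH7786 one below) registers its lookups one clkdev_add() call at a time. For a static array the two forms should be interchangeable; the loop is equivalent to the bulk helper used earlier:

        /* Equivalent registration, as in clock-sh7723.c: */
        clkdev_add_table(lookups, ARRAY_SIZE(lookups));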
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
index af69fd468703..c4a84bb2f3d9 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
@@ -13,6 +13,7 @@
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/clk.h> 14#include <linux/clk.h>
15#include <linux/io.h> 15#include <linux/io.h>
16#include <asm/clkdev.h>
16#include <asm/clock.h> 17#include <asm/clock.h>
17#include <asm/freq.h> 18#include <asm/freq.h>
18 19
@@ -87,12 +89,12 @@ struct clk div4_clks[DIV4_NR] = {
87 89
88static struct clk mstp_clks[] = { 90static struct clk mstp_clks[] = {
89 /* MSTPCR0 */ 91 /* MSTPCR0 */
90 SH_CLK_MSTP32("scif_fck", 5, &div4_clks[DIV4_P], MSTPCR0, 29, 0), 92 SH_CLK_MSTP32("sci_fck", 5, &div4_clks[DIV4_P], MSTPCR0, 29, 0),
91 SH_CLK_MSTP32("scif_fck", 4, &div4_clks[DIV4_P], MSTPCR0, 28, 0), 93 SH_CLK_MSTP32("sci_fck", 4, &div4_clks[DIV4_P], MSTPCR0, 28, 0),
92 SH_CLK_MSTP32("scif_fck", 3, &div4_clks[DIV4_P], MSTPCR0, 27, 0), 94 SH_CLK_MSTP32("sci_fck", 3, &div4_clks[DIV4_P], MSTPCR0, 27, 0),
93 SH_CLK_MSTP32("scif_fck", 2, &div4_clks[DIV4_P], MSTPCR0, 26, 0), 95 SH_CLK_MSTP32("sci_fck", 2, &div4_clks[DIV4_P], MSTPCR0, 26, 0),
94 SH_CLK_MSTP32("scif_fck", 1, &div4_clks[DIV4_P], MSTPCR0, 25, 0), 96 SH_CLK_MSTP32("sci_fck", 1, &div4_clks[DIV4_P], MSTPCR0, 25, 0),
95 SH_CLK_MSTP32("scif_fck", 0, &div4_clks[DIV4_P], MSTPCR0, 24, 0), 97 SH_CLK_MSTP32("sci_fck", 0, &div4_clks[DIV4_P], MSTPCR0, 24, 0),
96 SH_CLK_MSTP32("ssi_fck", 3, &div4_clks[DIV4_P], MSTPCR0, 23, 0), 98 SH_CLK_MSTP32("ssi_fck", 3, &div4_clks[DIV4_P], MSTPCR0, 23, 0),
97 SH_CLK_MSTP32("ssi_fck", 2, &div4_clks[DIV4_P], MSTPCR0, 22, 0), 99 SH_CLK_MSTP32("ssi_fck", 2, &div4_clks[DIV4_P], MSTPCR0, 22, 0),
98 SH_CLK_MSTP32("ssi_fck", 1, &div4_clks[DIV4_P], MSTPCR0, 21, 0), 100 SH_CLK_MSTP32("ssi_fck", 1, &div4_clks[DIV4_P], MSTPCR0, 21, 0),
@@ -120,12 +122,78 @@ static struct clk mstp_clks[] = {
120 SH_CLK_MSTP32("ether_fck", -1, NULL, MSTPCR1, 2, 0), 122 SH_CLK_MSTP32("ether_fck", -1, NULL, MSTPCR1, 2, 0),
121}; 123};
122 124
125static struct clk_lookup lookups[] = {
126 {
127 /* TMU0 */
128 .dev_id = "sh_tmu.0",
129 .con_id = "tmu_fck",
130 .clk = &mstp_clks[17], /* tmu012_fck */
131 }, {
132 /* TMU1 */
133 .dev_id = "sh_tmu.1",
134 .con_id = "tmu_fck",
135 .clk = &mstp_clks[17],
136 }, {
137 /* TMU2 */
138 .dev_id = "sh_tmu.2",
139 .con_id = "tmu_fck",
140 .clk = &mstp_clks[17],
141 }, {
142 /* TMU3 */
143 .dev_id = "sh_tmu.3",
144 .con_id = "tmu_fck",
145 .clk = &mstp_clks[16], /* tmu345_fck */
146 }, {
147 /* TMU4 */
148 .dev_id = "sh_tmu.4",
149 .con_id = "tmu_fck",
150 .clk = &mstp_clks[16],
151 }, {
152 /* TMU5 */
153 .dev_id = "sh_tmu.5",
154 .con_id = "tmu_fck",
155 .clk = &mstp_clks[16],
156 }, {
157 /* TMU6 */
158 .dev_id = "sh_tmu.6",
159 .con_id = "tmu_fck",
160 .clk = &mstp_clks[15], /* tmu678_fck */
161 }, {
162 /* TMU7 */
163 .dev_id = "sh_tmu.7",
164 .con_id = "tmu_fck",
165 .clk = &mstp_clks[15],
166 }, {
167 /* TMU8 */
168 .dev_id = "sh_tmu.8",
169 .con_id = "tmu_fck",
170 .clk = &mstp_clks[15],
171 }, {
172 /* TMU9 */
173 .dev_id = "sh_tmu.9",
174 .con_id = "tmu_fck",
175 .clk = &mstp_clks[14], /* tmu9_11_fck */
176 }, {
177 /* TMU10 */
178 .dev_id = "sh_tmu.10",
179 .con_id = "tmu_fck",
180 .clk = &mstp_clks[14],
181 }, {
182 /* TMU11 */
183 .dev_id = "sh_tmu.11",
184 .con_id = "tmu_fck",
185 .clk = &mstp_clks[14],
186 }
187};
188
123int __init arch_clk_init(void) 189int __init arch_clk_init(void)
124{ 190{
125 int i, ret = 0; 191 int i, ret = 0;
126 192
127 for (i = 0; i < ARRAY_SIZE(clks); i++) 193 for (i = 0; i < ARRAY_SIZE(clks); i++)
128 ret |= clk_register(clks[i]); 194 ret |= clk_register(clks[i]);
195 for (i = 0; i < ARRAY_SIZE(lookups); i++)
196 clkdev_add(&lookups[i]);
129 197
130 if (!ret) 198 if (!ret)
131 ret = sh_clk_div4_register(div4_clks, ARRAY_SIZE(div4_clks), 199 ret = sh_clk_div4_register(div4_clks, ARRAY_SIZE(div4_clks),
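A caveat worth noting: .clk = &mstp_clks[17] couples each lookup to the position of an entry in mstp_clks[], so inserting or removing a clock earlier in the array silently retargets every TMU lookup. A hypothetical init-time guard, not part of this patch and assuming the struct clk name field used throughout these files, could catch such drift:

        /* Hypothetical check, e.g. at the top of arch_clk_init(): */
        BUG_ON(strcmp(mstp_clks[17].name, "tmu012_fck") ||
               strcmp(mstp_clks[16].name, "tmu345_fck") ||
               strcmp(mstp_clks[15].name, "tmu678_fck") ||
               strcmp(mstp_clks[14].name, "tmu9_11_fck"));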
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
index 45eb1bfd42c9..3681cafdb4af 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
@@ -21,7 +21,6 @@ static struct plat_sci_port scif0_platform_data = {
21 .flags = UPF_BOOT_AUTOCONF, 21 .flags = UPF_BOOT_AUTOCONF,
22 .type = PORT_SCIF, 22 .type = PORT_SCIF,
23 .irqs = { 80, 80, 80, 80 }, 23 .irqs = { 80, 80, 80, 80 },
24 .clk = "scif0",
25}; 24};
26 25
27static struct platform_device scif0_device = { 26static struct platform_device scif0_device = {
@@ -37,7 +36,6 @@ static struct plat_sci_port scif1_platform_data = {
37 .flags = UPF_BOOT_AUTOCONF, 36 .flags = UPF_BOOT_AUTOCONF,
38 .type = PORT_SCIF, 37 .type = PORT_SCIF,
39 .irqs = { 81, 81, 81, 81 }, 38 .irqs = { 81, 81, 81, 81 },
40 .clk = "scif1",
41}; 39};
42 40
43static struct platform_device scif1_device = { 41static struct platform_device scif1_device = {
@@ -53,7 +51,6 @@ static struct plat_sci_port scif2_platform_data = {
53 .flags = UPF_BOOT_AUTOCONF, 51 .flags = UPF_BOOT_AUTOCONF,
54 .type = PORT_SCIF, 52 .type = PORT_SCIF,
55 .irqs = { 82, 82, 82, 82 }, 53 .irqs = { 82, 82, 82, 82 },
56 .clk = "scif2",
57}; 54};
58 55
59static struct platform_device scif2_device = { 56static struct platform_device scif2_device = {
@@ -69,7 +66,6 @@ static struct plat_sci_port scif3_platform_data = {
69 .flags = UPF_BOOT_AUTOCONF, 66 .flags = UPF_BOOT_AUTOCONF,
70 .type = PORT_SCIF, 67 .type = PORT_SCIF,
71 .irqs = { 83, 83, 83, 83 }, 68 .irqs = { 83, 83, 83, 83 },
72 .clk = "scif3",
73}; 69};
74 70
75static struct platform_device scif3_device = { 71static struct platform_device scif3_device = {
@@ -207,17 +203,14 @@ static struct platform_device jpu_device = {
207}; 203};
208 204
209static struct sh_timer_config cmt_platform_data = { 205static struct sh_timer_config cmt_platform_data = {
210 .name = "CMT",
211 .channel_offset = 0x60, 206 .channel_offset = 0x60,
212 .timer_bit = 5, 207 .timer_bit = 5,
213 .clk = "cmt0",
214 .clockevent_rating = 125, 208 .clockevent_rating = 125,
215 .clocksource_rating = 200, 209 .clocksource_rating = 200,
216}; 210};
217 211
218static struct resource cmt_resources[] = { 212static struct resource cmt_resources[] = {
219 [0] = { 213 [0] = {
220 .name = "CMT",
221 .start = 0x044a0060, 214 .start = 0x044a0060,
222 .end = 0x044a006b, 215 .end = 0x044a006b,
223 .flags = IORESOURCE_MEM, 216 .flags = IORESOURCE_MEM,
@@ -239,16 +232,13 @@ static struct platform_device cmt_device = {
239}; 232};
240 233
241static struct sh_timer_config tmu0_platform_data = { 234static struct sh_timer_config tmu0_platform_data = {
242 .name = "TMU0",
243 .channel_offset = 0x04, 235 .channel_offset = 0x04,
244 .timer_bit = 0, 236 .timer_bit = 0,
245 .clk = "tmu0",
246 .clockevent_rating = 200, 237 .clockevent_rating = 200,
247}; 238};
248 239
249static struct resource tmu0_resources[] = { 240static struct resource tmu0_resources[] = {
250 [0] = { 241 [0] = {
251 .name = "TMU0",
252 .start = 0xffd80008, 242 .start = 0xffd80008,
253 .end = 0xffd80013, 243 .end = 0xffd80013,
254 .flags = IORESOURCE_MEM, 244 .flags = IORESOURCE_MEM,
@@ -270,16 +260,13 @@ static struct platform_device tmu0_device = {
270}; 260};
271 261
272static struct sh_timer_config tmu1_platform_data = { 262static struct sh_timer_config tmu1_platform_data = {
273 .name = "TMU1",
274 .channel_offset = 0x10, 263 .channel_offset = 0x10,
275 .timer_bit = 1, 264 .timer_bit = 1,
276 .clk = "tmu0",
277 .clocksource_rating = 200, 265 .clocksource_rating = 200,
278}; 266};
279 267
280static struct resource tmu1_resources[] = { 268static struct resource tmu1_resources[] = {
281 [0] = { 269 [0] = {
282 .name = "TMU1",
283 .start = 0xffd80014, 270 .start = 0xffd80014,
284 .end = 0xffd8001f, 271 .end = 0xffd8001f,
285 .flags = IORESOURCE_MEM, 272 .flags = IORESOURCE_MEM,
@@ -301,15 +288,12 @@ static struct platform_device tmu1_device = {
301}; 288};
302 289
303static struct sh_timer_config tmu2_platform_data = { 290static struct sh_timer_config tmu2_platform_data = {
304 .name = "TMU2",
305 .channel_offset = 0x1c, 291 .channel_offset = 0x1c,
306 .timer_bit = 2, 292 .timer_bit = 2,
307 .clk = "tmu0",
308}; 293};
309 294
310static struct resource tmu2_resources[] = { 295static struct resource tmu2_resources[] = {
311 [0] = { 296 [0] = {
312 .name = "TMU2",
313 .start = 0xffd80020, 297 .start = 0xffd80020,
314 .end = 0xffd8002b, 298 .end = 0xffd8002b,
315 .flags = IORESOURCE_MEM, 299 .flags = IORESOURCE_MEM,
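From here on the setup-file side of the conversion is pure removal: with the lookup tables in place, the timer driver can derive its clock from the device name alone, so the per-channel .clk strings ("tmu0", "cmt0") and the redundant .name labels in sh_timer_config carry no information. A sketch of the assumed driver-side counterpart (not the verbatim sh_tmu/sh_cmt probe path):

#include <linux/clk.h>
#include <linux/err.h>
#include <linux/platform_device.h>

static int sketch_timer_clk_get(struct platform_device *pdev,
                                struct clk **clkp)
{
        /* Previously: clk_get(&pdev->dev, cfg->clk), with cfg->clk set
         * per channel. Now one fixed connection id suffices; clkdev
         * (or the (name, id) fallback) maps "sh_tmu.N" to the right
         * MSTP clock. */
        struct clk *clk = clk_get(&pdev->dev, "tmu_fck");

        if (IS_ERR(clk))
                return PTR_ERR(clk);

        *clkp = clk;
        return 0;
}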
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
index c494c193e3b6..8dab9e1bbd89 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
@@ -23,7 +23,6 @@ static struct plat_sci_port scif0_platform_data = {
23 .flags = UPF_BOOT_AUTOCONF, 23 .flags = UPF_BOOT_AUTOCONF,
24 .type = PORT_SCIF, 24 .type = PORT_SCIF,
25 .irqs = { 80, 80, 80, 80 }, 25 .irqs = { 80, 80, 80, 80 },
26 .clk = "scif0",
27}; 26};
28 27
29static struct platform_device scif0_device = { 28static struct platform_device scif0_device = {
@@ -169,17 +168,14 @@ static struct platform_device veu1_device = {
169}; 168};
170 169
171static struct sh_timer_config cmt_platform_data = { 170static struct sh_timer_config cmt_platform_data = {
172 .name = "CMT",
173 .channel_offset = 0x60, 171 .channel_offset = 0x60,
174 .timer_bit = 5, 172 .timer_bit = 5,
175 .clk = "cmt0",
176 .clockevent_rating = 125, 173 .clockevent_rating = 125,
177 .clocksource_rating = 200, 174 .clocksource_rating = 200,
178}; 175};
179 176
180static struct resource cmt_resources[] = { 177static struct resource cmt_resources[] = {
181 [0] = { 178 [0] = {
182 .name = "CMT",
183 .start = 0x044a0060, 179 .start = 0x044a0060,
184 .end = 0x044a006b, 180 .end = 0x044a006b,
185 .flags = IORESOURCE_MEM, 181 .flags = IORESOURCE_MEM,
@@ -201,16 +197,13 @@ static struct platform_device cmt_device = {
201}; 197};
202 198
203static struct sh_timer_config tmu0_platform_data = { 199static struct sh_timer_config tmu0_platform_data = {
204 .name = "TMU0",
205 .channel_offset = 0x04, 200 .channel_offset = 0x04,
206 .timer_bit = 0, 201 .timer_bit = 0,
207 .clk = "tmu0",
208 .clockevent_rating = 200, 202 .clockevent_rating = 200,
209}; 203};
210 204
211static struct resource tmu0_resources[] = { 205static struct resource tmu0_resources[] = {
212 [0] = { 206 [0] = {
213 .name = "TMU0",
214 .start = 0xffd80008, 207 .start = 0xffd80008,
215 .end = 0xffd80013, 208 .end = 0xffd80013,
216 .flags = IORESOURCE_MEM, 209 .flags = IORESOURCE_MEM,
@@ -232,16 +225,13 @@ static struct platform_device tmu0_device = {
232}; 225};
233 226
234static struct sh_timer_config tmu1_platform_data = { 227static struct sh_timer_config tmu1_platform_data = {
235 .name = "TMU1",
236 .channel_offset = 0x10, 228 .channel_offset = 0x10,
237 .timer_bit = 1, 229 .timer_bit = 1,
238 .clk = "tmu0",
239 .clocksource_rating = 200, 230 .clocksource_rating = 200,
240}; 231};
241 232
242static struct resource tmu1_resources[] = { 233static struct resource tmu1_resources[] = {
243 [0] = { 234 [0] = {
244 .name = "TMU1",
245 .start = 0xffd80014, 235 .start = 0xffd80014,
246 .end = 0xffd8001f, 236 .end = 0xffd8001f,
247 .flags = IORESOURCE_MEM, 237 .flags = IORESOURCE_MEM,
@@ -263,15 +253,12 @@ static struct platform_device tmu1_device = {
263}; 253};
264 254
265static struct sh_timer_config tmu2_platform_data = { 255static struct sh_timer_config tmu2_platform_data = {
266 .name = "TMU2",
267 .channel_offset = 0x1c, 256 .channel_offset = 0x1c,
268 .timer_bit = 2, 257 .timer_bit = 2,
269 .clk = "tmu0",
270}; 258};
271 259
272static struct resource tmu2_resources[] = { 260static struct resource tmu2_resources[] = {
273 [0] = { 261 [0] = {
274 .name = "TMU2",
275 .start = 0xffd80020, 262 .start = 0xffd80020,
276 .end = 0xffd8002b, 263 .end = 0xffd8002b,
277 .flags = IORESOURCE_MEM, 264 .flags = IORESOURCE_MEM,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
index fd7e3639e845..dc9b30d086a4 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
@@ -174,7 +174,6 @@ static struct plat_sci_port scif0_platform_data = {
174 .flags = UPF_BOOT_AUTOCONF, 174 .flags = UPF_BOOT_AUTOCONF,
175 .type = PORT_SCIF, 175 .type = PORT_SCIF,
176 .irqs = { 80, 80, 80, 80 }, 176 .irqs = { 80, 80, 80, 80 },
177 .clk = "scif0",
178}; 177};
179 178
180static struct platform_device scif0_device = { 179static struct platform_device scif0_device = {
@@ -190,7 +189,6 @@ static struct plat_sci_port scif1_platform_data = {
190 .flags = UPF_BOOT_AUTOCONF, 189 .flags = UPF_BOOT_AUTOCONF,
191 .type = PORT_SCIF, 190 .type = PORT_SCIF,
192 .irqs = { 81, 81, 81, 81 }, 191 .irqs = { 81, 81, 81, 81 },
193 .clk = "scif1",
194}; 192};
195 193
196static struct platform_device scif1_device = { 194static struct platform_device scif1_device = {
@@ -206,7 +204,6 @@ static struct plat_sci_port scif2_platform_data = {
206 .flags = UPF_BOOT_AUTOCONF, 204 .flags = UPF_BOOT_AUTOCONF,
207 .type = PORT_SCIF, 205 .type = PORT_SCIF,
208 .irqs = { 82, 82, 82, 82 }, 206 .irqs = { 82, 82, 82, 82 },
209 .clk = "scif2",
210}; 207};
211 208
212static struct platform_device scif2_device = { 209static struct platform_device scif2_device = {
@@ -401,17 +398,14 @@ static struct platform_device jpu_device = {
401}; 398};
402 399
403static struct sh_timer_config cmt_platform_data = { 400static struct sh_timer_config cmt_platform_data = {
404 .name = "CMT",
405 .channel_offset = 0x60, 401 .channel_offset = 0x60,
406 .timer_bit = 5, 402 .timer_bit = 5,
407 .clk = "cmt0",
408 .clockevent_rating = 125, 403 .clockevent_rating = 125,
409 .clocksource_rating = 125, 404 .clocksource_rating = 125,
410}; 405};
411 406
412static struct resource cmt_resources[] = { 407static struct resource cmt_resources[] = {
413 [0] = { 408 [0] = {
414 .name = "CMT",
415 .start = 0x044a0060, 409 .start = 0x044a0060,
416 .end = 0x044a006b, 410 .end = 0x044a006b,
417 .flags = IORESOURCE_MEM, 411 .flags = IORESOURCE_MEM,
@@ -436,16 +430,13 @@ static struct platform_device cmt_device = {
436}; 430};
437 431
438static struct sh_timer_config tmu0_platform_data = { 432static struct sh_timer_config tmu0_platform_data = {
439 .name = "TMU0",
440 .channel_offset = 0x04, 433 .channel_offset = 0x04,
441 .timer_bit = 0, 434 .timer_bit = 0,
442 .clk = "tmu0",
443 .clockevent_rating = 200, 435 .clockevent_rating = 200,
444}; 436};
445 437
446static struct resource tmu0_resources[] = { 438static struct resource tmu0_resources[] = {
447 [0] = { 439 [0] = {
448 .name = "TMU0",
449 .start = 0xffd80008, 440 .start = 0xffd80008,
450 .end = 0xffd80013, 441 .end = 0xffd80013,
451 .flags = IORESOURCE_MEM, 442 .flags = IORESOURCE_MEM,
@@ -470,16 +461,13 @@ static struct platform_device tmu0_device = {
470}; 461};
471 462
472static struct sh_timer_config tmu1_platform_data = { 463static struct sh_timer_config tmu1_platform_data = {
473 .name = "TMU1",
474 .channel_offset = 0x10, 464 .channel_offset = 0x10,
475 .timer_bit = 1, 465 .timer_bit = 1,
476 .clk = "tmu0",
477 .clocksource_rating = 200, 466 .clocksource_rating = 200,
478}; 467};
479 468
480static struct resource tmu1_resources[] = { 469static struct resource tmu1_resources[] = {
481 [0] = { 470 [0] = {
482 .name = "TMU1",
483 .start = 0xffd80014, 471 .start = 0xffd80014,
484 .end = 0xffd8001f, 472 .end = 0xffd8001f,
485 .flags = IORESOURCE_MEM, 473 .flags = IORESOURCE_MEM,
@@ -504,15 +492,12 @@ static struct platform_device tmu1_device = {
504}; 492};
505 493
506static struct sh_timer_config tmu2_platform_data = { 494static struct sh_timer_config tmu2_platform_data = {
507 .name = "TMU2",
508 .channel_offset = 0x1c, 495 .channel_offset = 0x1c,
509 .timer_bit = 2, 496 .timer_bit = 2,
510 .clk = "tmu0",
511}; 497};
512 498
513static struct resource tmu2_resources[] = { 499static struct resource tmu2_resources[] = {
514 [0] = { 500 [0] = {
515 .name = "TMU2",
516 .start = 0xffd80020, 501 .start = 0xffd80020,
517 .end = 0xffd8002b, 502 .end = 0xffd8002b,
518 .flags = IORESOURCE_MEM, 503 .flags = IORESOURCE_MEM,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
index 85c61f624702..0eadefdbbba1 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
@@ -26,7 +26,6 @@ static struct plat_sci_port scif0_platform_data = {
26 .flags = UPF_BOOT_AUTOCONF, 26 .flags = UPF_BOOT_AUTOCONF,
27 .type = PORT_SCIF, 27 .type = PORT_SCIF,
28 .irqs = { 80, 80, 80, 80 }, 28 .irqs = { 80, 80, 80, 80 },
29 .clk = "scif0",
30}; 29};
31 30
32static struct platform_device scif0_device = { 31static struct platform_device scif0_device = {
@@ -42,7 +41,6 @@ static struct plat_sci_port scif1_platform_data = {
42 .flags = UPF_BOOT_AUTOCONF, 41 .flags = UPF_BOOT_AUTOCONF,
43 .type = PORT_SCIF, 42 .type = PORT_SCIF,
44 .irqs = { 81, 81, 81, 81 }, 43 .irqs = { 81, 81, 81, 81 },
45 .clk = "scif1",
46}; 44};
47 45
48static struct platform_device scif1_device = { 46static struct platform_device scif1_device = {
@@ -58,7 +56,6 @@ static struct plat_sci_port scif2_platform_data = {
58 .flags = UPF_BOOT_AUTOCONF, 56 .flags = UPF_BOOT_AUTOCONF,
59 .type = PORT_SCIF, 57 .type = PORT_SCIF,
60 .irqs = { 82, 82, 82, 82 }, 58 .irqs = { 82, 82, 82, 82 },
61 .clk = "scif2",
62}; 59};
63 60
64static struct platform_device scif2_device = { 61static struct platform_device scif2_device = {
@@ -74,7 +71,6 @@ static struct plat_sci_port scif3_platform_data = {
74 .flags = UPF_BOOT_AUTOCONF, 71 .flags = UPF_BOOT_AUTOCONF,
75 .type = PORT_SCIFA, 72 .type = PORT_SCIFA,
76 .irqs = { 56, 56, 56, 56 }, 73 .irqs = { 56, 56, 56, 56 },
77 .clk = "scif3",
78}; 74};
79 75
80static struct platform_device scif3_device = { 76static struct platform_device scif3_device = {
@@ -90,7 +86,6 @@ static struct plat_sci_port scif4_platform_data = {
90 .flags = UPF_BOOT_AUTOCONF, 86 .flags = UPF_BOOT_AUTOCONF,
91 .type = PORT_SCIFA, 87 .type = PORT_SCIFA,
92 .irqs = { 88, 88, 88, 88 }, 88 .irqs = { 88, 88, 88, 88 },
93 .clk = "scif4",
94}; 89};
95 90
96static struct platform_device scif4_device = { 91static struct platform_device scif4_device = {
@@ -106,7 +101,6 @@ static struct plat_sci_port scif5_platform_data = {
106 .flags = UPF_BOOT_AUTOCONF, 101 .flags = UPF_BOOT_AUTOCONF,
107 .type = PORT_SCIFA, 102 .type = PORT_SCIFA,
108 .irqs = { 109, 109, 109, 109 }, 103 .irqs = { 109, 109, 109, 109 },
109 .clk = "scif5",
110}; 104};
111 105
112static struct platform_device scif5_device = { 106static struct platform_device scif5_device = {
@@ -211,17 +205,14 @@ static struct platform_device veu1_device = {
211}; 205};
212 206
213static struct sh_timer_config cmt_platform_data = { 207static struct sh_timer_config cmt_platform_data = {
214 .name = "CMT",
215 .channel_offset = 0x60, 208 .channel_offset = 0x60,
216 .timer_bit = 5, 209 .timer_bit = 5,
217 .clk = "cmt0",
218 .clockevent_rating = 125, 210 .clockevent_rating = 125,
219 .clocksource_rating = 125, 211 .clocksource_rating = 125,
220}; 212};
221 213
222static struct resource cmt_resources[] = { 214static struct resource cmt_resources[] = {
223 [0] = { 215 [0] = {
224 .name = "CMT",
225 .start = 0x044a0060, 216 .start = 0x044a0060,
226 .end = 0x044a006b, 217 .end = 0x044a006b,
227 .flags = IORESOURCE_MEM, 218 .flags = IORESOURCE_MEM,
@@ -246,16 +237,13 @@ static struct platform_device cmt_device = {
246}; 237};
247 238
248static struct sh_timer_config tmu0_platform_data = { 239static struct sh_timer_config tmu0_platform_data = {
249 .name = "TMU0",
250 .channel_offset = 0x04, 240 .channel_offset = 0x04,
251 .timer_bit = 0, 241 .timer_bit = 0,
252 .clk = "tmu0",
253 .clockevent_rating = 200, 242 .clockevent_rating = 200,
254}; 243};
255 244
256static struct resource tmu0_resources[] = { 245static struct resource tmu0_resources[] = {
257 [0] = { 246 [0] = {
258 .name = "TMU0",
259 .start = 0xffd80008, 247 .start = 0xffd80008,
260 .end = 0xffd80013, 248 .end = 0xffd80013,
261 .flags = IORESOURCE_MEM, 249 .flags = IORESOURCE_MEM,
@@ -280,16 +268,13 @@ static struct platform_device tmu0_device = {
280}; 268};
281 269
282static struct sh_timer_config tmu1_platform_data = { 270static struct sh_timer_config tmu1_platform_data = {
283 .name = "TMU1",
284 .channel_offset = 0x10, 271 .channel_offset = 0x10,
285 .timer_bit = 1, 272 .timer_bit = 1,
286 .clk = "tmu0",
287 .clocksource_rating = 200, 273 .clocksource_rating = 200,
288}; 274};
289 275
290static struct resource tmu1_resources[] = { 276static struct resource tmu1_resources[] = {
291 [0] = { 277 [0] = {
292 .name = "TMU1",
293 .start = 0xffd80014, 278 .start = 0xffd80014,
294 .end = 0xffd8001f, 279 .end = 0xffd8001f,
295 .flags = IORESOURCE_MEM, 280 .flags = IORESOURCE_MEM,
@@ -314,15 +299,12 @@ static struct platform_device tmu1_device = {
314}; 299};
315 300
316static struct sh_timer_config tmu2_platform_data = { 301static struct sh_timer_config tmu2_platform_data = {
317 .name = "TMU2",
318 .channel_offset = 0x1c, 302 .channel_offset = 0x1c,
319 .timer_bit = 2, 303 .timer_bit = 2,
320 .clk = "tmu0",
321}; 304};
322 305
323static struct resource tmu2_resources[] = { 306static struct resource tmu2_resources[] = {
324 [0] = { 307 [0] = {
325 .name = "TMU2",
326 .start = 0xffd80020, 308 .start = 0xffd80020,
327 .end = 0xffd8002b, 309 .end = 0xffd8002b,
328 .flags = IORESOURCE_MEM, 310 .flags = IORESOURCE_MEM,
@@ -347,15 +329,12 @@ static struct platform_device tmu2_device = {
347}; 329};
348 330
349static struct sh_timer_config tmu3_platform_data = { 331static struct sh_timer_config tmu3_platform_data = {
350 .name = "TMU3",
351 .channel_offset = 0x04, 332 .channel_offset = 0x04,
352 .timer_bit = 0, 333 .timer_bit = 0,
353 .clk = "tmu1",
354}; 334};
355 335
356static struct resource tmu3_resources[] = { 336static struct resource tmu3_resources[] = {
357 [0] = { 337 [0] = {
358 .name = "TMU3",
359 .start = 0xffd90008, 338 .start = 0xffd90008,
360 .end = 0xffd90013, 339 .end = 0xffd90013,
361 .flags = IORESOURCE_MEM, 340 .flags = IORESOURCE_MEM,
@@ -380,15 +359,12 @@ static struct platform_device tmu3_device = {
380}; 359};
381 360
382static struct sh_timer_config tmu4_platform_data = { 361static struct sh_timer_config tmu4_platform_data = {
383 .name = "TMU4",
384 .channel_offset = 0x10, 362 .channel_offset = 0x10,
385 .timer_bit = 1, 363 .timer_bit = 1,
386 .clk = "tmu1",
387}; 364};
388 365
389static struct resource tmu4_resources[] = { 366static struct resource tmu4_resources[] = {
390 [0] = { 367 [0] = {
391 .name = "TMU4",
392 .start = 0xffd90014, 368 .start = 0xffd90014,
393 .end = 0xffd9001f, 369 .end = 0xffd9001f,
394 .flags = IORESOURCE_MEM, 370 .flags = IORESOURCE_MEM,
@@ -413,15 +389,12 @@ static struct platform_device tmu4_device = {
413}; 389};
414 390
415static struct sh_timer_config tmu5_platform_data = { 391static struct sh_timer_config tmu5_platform_data = {
416 .name = "TMU5",
417 .channel_offset = 0x1c, 392 .channel_offset = 0x1c,
418 .timer_bit = 2, 393 .timer_bit = 2,
419 .clk = "tmu1",
420}; 394};
421 395
422static struct resource tmu5_resources[] = { 396static struct resource tmu5_resources[] = {
423 [0] = { 397 [0] = {
424 .name = "TMU5",
425 .start = 0xffd90020, 398 .start = 0xffd90020,
426 .end = 0xffd9002b, 399 .end = 0xffd9002b,
427 .flags = IORESOURCE_MEM, 400 .flags = IORESOURCE_MEM,
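The removed resource .name strings are just as redundant as the platform-data ones: when a platform resource is registered with a NULL name, the platform core fills it in with the device name, so /proc/iomem ends up showing "sh_tmu.0" rather than "TMU0". Roughly what drivers/base/platform.c is understood to do at device-add time (a paraphrase):

#include <linux/device.h>
#include <linux/ioport.h>
#include <linux/platform_device.h>

static void sketch_name_resources(struct platform_device *pdev)
{
        int i;

        for (i = 0; i < pdev->num_resources; i++) {
                struct resource *r = &pdev->resource[i];

                if (!r->name)
                        r->name = dev_name(&pdev->dev);
        }
}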
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index e7fa2a92fc1f..8a0a4a99f86b 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -213,7 +213,6 @@ static struct plat_sci_port scif0_platform_data = {
213 .flags = UPF_BOOT_AUTOCONF, 213 .flags = UPF_BOOT_AUTOCONF,
214 .type = PORT_SCIF, 214 .type = PORT_SCIF,
215 .irqs = { 80, 80, 80, 80 }, 215 .irqs = { 80, 80, 80, 80 },
216 .clk = "scif0",
217}; 216};
218 217
219static struct platform_device scif0_device = { 218static struct platform_device scif0_device = {
@@ -229,7 +228,6 @@ static struct plat_sci_port scif1_platform_data = {
229 .flags = UPF_BOOT_AUTOCONF, 228 .flags = UPF_BOOT_AUTOCONF,
230 .type = PORT_SCIF, 229 .type = PORT_SCIF,
231 .irqs = { 81, 81, 81, 81 }, 230 .irqs = { 81, 81, 81, 81 },
232 .clk = "scif1",
233}; 231};
234 232
235static struct platform_device scif1_device = { 233static struct platform_device scif1_device = {
@@ -245,7 +243,6 @@ static struct plat_sci_port scif2_platform_data = {
245 .flags = UPF_BOOT_AUTOCONF, 243 .flags = UPF_BOOT_AUTOCONF,
246 .type = PORT_SCIF, 244 .type = PORT_SCIF,
247 .irqs = { 82, 82, 82, 82 }, 245 .irqs = { 82, 82, 82, 82 },
248 .clk = "scif2",
249}; 246};
250 247
251static struct platform_device scif2_device = { 248static struct platform_device scif2_device = {
@@ -261,7 +258,6 @@ static struct plat_sci_port scif3_platform_data = {
261 .flags = UPF_BOOT_AUTOCONF, 258 .flags = UPF_BOOT_AUTOCONF,
262 .type = PORT_SCIFA, 259 .type = PORT_SCIFA,
263 .irqs = { 56, 56, 56, 56 }, 260 .irqs = { 56, 56, 56, 56 },
264 .clk = "scif3",
265}; 261};
266 262
267static struct platform_device scif3_device = { 263static struct platform_device scif3_device = {
@@ -277,7 +273,6 @@ static struct plat_sci_port scif4_platform_data = {
277 .flags = UPF_BOOT_AUTOCONF, 273 .flags = UPF_BOOT_AUTOCONF,
278 .type = PORT_SCIFA, 274 .type = PORT_SCIFA,
279 .irqs = { 88, 88, 88, 88 }, 275 .irqs = { 88, 88, 88, 88 },
280 .clk = "scif4",
281}; 276};
282 277
283static struct platform_device scif4_device = { 278static struct platform_device scif4_device = {
@@ -293,7 +288,6 @@ static struct plat_sci_port scif5_platform_data = {
293 .flags = UPF_BOOT_AUTOCONF, 288 .flags = UPF_BOOT_AUTOCONF,
294 .type = PORT_SCIFA, 289 .type = PORT_SCIFA,
295 .irqs = { 109, 109, 109, 109 }, 290 .irqs = { 109, 109, 109, 109 },
296 .clk = "scif5",
297}; 291};
298 292
299static struct platform_device scif5_device = { 293static struct platform_device scif5_device = {
@@ -485,17 +479,14 @@ static struct platform_device veu1_device = {
485}; 479};
486 480
487static struct sh_timer_config cmt_platform_data = { 481static struct sh_timer_config cmt_platform_data = {
488 .name = "CMT",
489 .channel_offset = 0x60, 482 .channel_offset = 0x60,
490 .timer_bit = 5, 483 .timer_bit = 5,
491 .clk = "cmt0",
492 .clockevent_rating = 125, 484 .clockevent_rating = 125,
493 .clocksource_rating = 200, 485 .clocksource_rating = 200,
494}; 486};
495 487
496static struct resource cmt_resources[] = { 488static struct resource cmt_resources[] = {
497 [0] = { 489 [0] = {
498 .name = "CMT",
499 .start = 0x044a0060, 490 .start = 0x044a0060,
500 .end = 0x044a006b, 491 .end = 0x044a006b,
501 .flags = IORESOURCE_MEM, 492 .flags = IORESOURCE_MEM,
@@ -520,16 +511,13 @@ static struct platform_device cmt_device = {
520}; 511};
521 512
522static struct sh_timer_config tmu0_platform_data = { 513static struct sh_timer_config tmu0_platform_data = {
523 .name = "TMU0",
524 .channel_offset = 0x04, 514 .channel_offset = 0x04,
525 .timer_bit = 0, 515 .timer_bit = 0,
526 .clk = "tmu0",
527 .clockevent_rating = 200, 516 .clockevent_rating = 200,
528}; 517};
529 518
530static struct resource tmu0_resources[] = { 519static struct resource tmu0_resources[] = {
531 [0] = { 520 [0] = {
532 .name = "TMU0",
533 .start = 0xffd80008, 521 .start = 0xffd80008,
534 .end = 0xffd80013, 522 .end = 0xffd80013,
535 .flags = IORESOURCE_MEM, 523 .flags = IORESOURCE_MEM,
@@ -554,16 +542,13 @@ static struct platform_device tmu0_device = {
554}; 542};
555 543
556static struct sh_timer_config tmu1_platform_data = { 544static struct sh_timer_config tmu1_platform_data = {
557 .name = "TMU1",
558 .channel_offset = 0x10, 545 .channel_offset = 0x10,
559 .timer_bit = 1, 546 .timer_bit = 1,
560 .clk = "tmu0",
561 .clocksource_rating = 200, 547 .clocksource_rating = 200,
562}; 548};
563 549
564static struct resource tmu1_resources[] = { 550static struct resource tmu1_resources[] = {
565 [0] = { 551 [0] = {
566 .name = "TMU1",
567 .start = 0xffd80014, 552 .start = 0xffd80014,
568 .end = 0xffd8001f, 553 .end = 0xffd8001f,
569 .flags = IORESOURCE_MEM, 554 .flags = IORESOURCE_MEM,
@@ -588,15 +573,12 @@ static struct platform_device tmu1_device = {
588}; 573};
589 574
590static struct sh_timer_config tmu2_platform_data = { 575static struct sh_timer_config tmu2_platform_data = {
591 .name = "TMU2",
592 .channel_offset = 0x1c, 576 .channel_offset = 0x1c,
593 .timer_bit = 2, 577 .timer_bit = 2,
594 .clk = "tmu0",
595}; 578};
596 579
597static struct resource tmu2_resources[] = { 580static struct resource tmu2_resources[] = {
598 [0] = { 581 [0] = {
599 .name = "TMU2",
600 .start = 0xffd80020, 582 .start = 0xffd80020,
601 .end = 0xffd8002b, 583 .end = 0xffd8002b,
602 .flags = IORESOURCE_MEM, 584 .flags = IORESOURCE_MEM,
@@ -622,15 +604,12 @@ static struct platform_device tmu2_device = {
622 604
623 605
624static struct sh_timer_config tmu3_platform_data = { 606static struct sh_timer_config tmu3_platform_data = {
625 .name = "TMU3",
626 .channel_offset = 0x04, 607 .channel_offset = 0x04,
627 .timer_bit = 0, 608 .timer_bit = 0,
628 .clk = "tmu1",
629}; 609};
630 610
631static struct resource tmu3_resources[] = { 611static struct resource tmu3_resources[] = {
632 [0] = { 612 [0] = {
633 .name = "TMU3",
634 .start = 0xffd90008, 613 .start = 0xffd90008,
635 .end = 0xffd90013, 614 .end = 0xffd90013,
636 .flags = IORESOURCE_MEM, 615 .flags = IORESOURCE_MEM,
@@ -655,15 +634,12 @@ static struct platform_device tmu3_device = {
655}; 634};
656 635
657static struct sh_timer_config tmu4_platform_data = { 636static struct sh_timer_config tmu4_platform_data = {
658 .name = "TMU4",
659 .channel_offset = 0x10, 637 .channel_offset = 0x10,
660 .timer_bit = 1, 638 .timer_bit = 1,
661 .clk = "tmu1",
662}; 639};
663 640
664static struct resource tmu4_resources[] = { 641static struct resource tmu4_resources[] = {
665 [0] = { 642 [0] = {
666 .name = "TMU4",
667 .start = 0xffd90014, 643 .start = 0xffd90014,
668 .end = 0xffd9001f, 644 .end = 0xffd9001f,
669 .flags = IORESOURCE_MEM, 645 .flags = IORESOURCE_MEM,
@@ -688,15 +664,12 @@ static struct platform_device tmu4_device = {
688}; 664};
689 665
690static struct sh_timer_config tmu5_platform_data = { 666static struct sh_timer_config tmu5_platform_data = {
691 .name = "TMU5",
692 .channel_offset = 0x1c, 667 .channel_offset = 0x1c,
693 .timer_bit = 2, 668 .timer_bit = 2,
694 .clk = "tmu1",
695}; 669};
696 670
697static struct resource tmu5_resources[] = { 671static struct resource tmu5_resources[] = {
698 [0] = { 672 [0] = {
699 .name = "TMU5",
700 .start = 0xffd90020, 673 .start = 0xffd90020,
701 .end = 0xffd9002b, 674 .end = 0xffd9002b,
702 .flags = IORESOURCE_MEM, 675 .flags = IORESOURCE_MEM,
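The serial ports follow the same pattern: every .clk = "scifN" string dropped from plat_sci_port is replaced by the driver asking for a fixed "sci_fck" connection id, which the renamed SH_HWBLK_CLK("sci_fck", N, ...) entries satisfy through the (name, id) match sketched earlier. The assumed driver-side request (a sketch, not the verbatim sh-sci code):

#include <linux/clk.h>
#include <linux/platform_device.h>

/* From platform device "sh-sci.2" this resolves to the
 * SH_HWBLK_CLK("sci_fck", 2, ...) entry on SH7722/23/24. */
static struct clk *sketch_sci_get_fclk(struct platform_device *pdev)
{
        return clk_get(&pdev->dev, "sci_fck");
}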
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7757.c b/arch/sh/kernel/cpu/sh4a/setup-sh7757.c
index e75edf58796a..444aca95b20d 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7757.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7757.c
@@ -63,16 +63,13 @@ static struct platform_device scif4_device = {
63}; 63};
64 64
65static struct sh_timer_config tmu0_platform_data = { 65static struct sh_timer_config tmu0_platform_data = {
66 .name = "TMU0",
67 .channel_offset = 0x04, 66 .channel_offset = 0x04,
68 .timer_bit = 0, 67 .timer_bit = 0,
69 .clk = "peripheral_clk",
70 .clockevent_rating = 200, 68 .clockevent_rating = 200,
71}; 69};
72 70
73static struct resource tmu0_resources[] = { 71static struct resource tmu0_resources[] = {
74 [0] = { 72 [0] = {
75 .name = "TMU0",
76 .start = 0xfe430008, 73 .start = 0xfe430008,
77 .end = 0xfe430013, 74 .end = 0xfe430013,
78 .flags = IORESOURCE_MEM, 75 .flags = IORESOURCE_MEM,
@@ -94,16 +91,13 @@ static struct platform_device tmu0_device = {
94}; 91};
95 92
96static struct sh_timer_config tmu1_platform_data = { 93static struct sh_timer_config tmu1_platform_data = {
97 .name = "TMU1",
98 .channel_offset = 0x10, 94 .channel_offset = 0x10,
99 .timer_bit = 1, 95 .timer_bit = 1,
100 .clk = "peripheral_clk",
101 .clocksource_rating = 200, 96 .clocksource_rating = 200,
102}; 97};
103 98
104static struct resource tmu1_resources[] = { 99static struct resource tmu1_resources[] = {
105 [0] = { 100 [0] = {
106 .name = "TMU1",
107 .start = 0xfe430014, 101 .start = 0xfe430014,
108 .end = 0xfe43001f, 102 .end = 0xfe43001f,
109 .flags = IORESOURCE_MEM, 103 .flags = IORESOURCE_MEM,
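Note that SH7757 (and the SH7763/SH7770/SH7780 files below) previously pointed the timers straight at "peripheral_clk". Dropping that string only works if a "tmu_fck" request can still be satisfied, presumably via lookup entries added in the corresponding clock-*.c files elsewhere in this series. A purely hypothetical equivalent, with peripheral_clk as an assumed symbol, for illustration only:

        static struct clk_lookup sketch_lookups[] = {
                {
                        .dev_id = "sh_tmu.0",
                        .con_id = "tmu_fck",
                        .clk    = &peripheral_clk, /* assumed symbol */
                },
        };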
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7763.c b/arch/sh/kernel/cpu/sh4a/setup-sh7763.c
index 7f6b0a5f7f82..5b5f6b005fc5 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7763.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7763.c
@@ -131,16 +131,13 @@ static struct platform_device usbf_device = {
131}; 131};
132 132
133static struct sh_timer_config tmu0_platform_data = { 133static struct sh_timer_config tmu0_platform_data = {
134 .name = "TMU0",
135 .channel_offset = 0x04, 134 .channel_offset = 0x04,
136 .timer_bit = 0, 135 .timer_bit = 0,
137 .clk = "peripheral_clk",
138 .clockevent_rating = 200, 136 .clockevent_rating = 200,
139}; 137};
140 138
141static struct resource tmu0_resources[] = { 139static struct resource tmu0_resources[] = {
142 [0] = { 140 [0] = {
143 .name = "TMU0",
144 .start = 0xffd80008, 141 .start = 0xffd80008,
145 .end = 0xffd80013, 142 .end = 0xffd80013,
146 .flags = IORESOURCE_MEM, 143 .flags = IORESOURCE_MEM,
@@ -162,16 +159,13 @@ static struct platform_device tmu0_device = {
162}; 159};
163 160
164static struct sh_timer_config tmu1_platform_data = { 161static struct sh_timer_config tmu1_platform_data = {
165 .name = "TMU1",
166 .channel_offset = 0x10, 162 .channel_offset = 0x10,
167 .timer_bit = 1, 163 .timer_bit = 1,
168 .clk = "peripheral_clk",
169 .clocksource_rating = 200, 164 .clocksource_rating = 200,
170}; 165};
171 166
172static struct resource tmu1_resources[] = { 167static struct resource tmu1_resources[] = {
173 [0] = { 168 [0] = {
174 .name = "TMU1",
175 .start = 0xffd80014, 169 .start = 0xffd80014,
176 .end = 0xffd8001f, 170 .end = 0xffd8001f,
177 .flags = IORESOURCE_MEM, 171 .flags = IORESOURCE_MEM,
@@ -193,15 +187,12 @@ static struct platform_device tmu1_device = {
193}; 187};
194 188
195static struct sh_timer_config tmu2_platform_data = { 189static struct sh_timer_config tmu2_platform_data = {
196 .name = "TMU2",
197 .channel_offset = 0x1c, 190 .channel_offset = 0x1c,
198 .timer_bit = 2, 191 .timer_bit = 2,
199 .clk = "peripheral_clk",
200}; 192};
201 193
202static struct resource tmu2_resources[] = { 194static struct resource tmu2_resources[] = {
203 [0] = { 195 [0] = {
204 .name = "TMU2",
205 .start = 0xffd80020, 196 .start = 0xffd80020,
206 .end = 0xffd8002f, 197 .end = 0xffd8002f,
207 .flags = IORESOURCE_MEM, 198 .flags = IORESOURCE_MEM,
@@ -223,15 +214,12 @@ static struct platform_device tmu2_device = {
223}; 214};
224 215
225static struct sh_timer_config tmu3_platform_data = { 216static struct sh_timer_config tmu3_platform_data = {
226 .name = "TMU3",
227 .channel_offset = 0x04, 217 .channel_offset = 0x04,
228 .timer_bit = 0, 218 .timer_bit = 0,
229 .clk = "peripheral_clk",
230}; 219};
231 220
232static struct resource tmu3_resources[] = { 221static struct resource tmu3_resources[] = {
233 [0] = { 222 [0] = {
234 .name = "TMU3",
235 .start = 0xffd88008, 223 .start = 0xffd88008,
236 .end = 0xffd88013, 224 .end = 0xffd88013,
237 .flags = IORESOURCE_MEM, 225 .flags = IORESOURCE_MEM,
@@ -253,15 +241,12 @@ static struct platform_device tmu3_device = {
253}; 241};
254 242
255static struct sh_timer_config tmu4_platform_data = { 243static struct sh_timer_config tmu4_platform_data = {
256 .name = "TMU4",
257 .channel_offset = 0x10, 244 .channel_offset = 0x10,
258 .timer_bit = 1, 245 .timer_bit = 1,
259 .clk = "peripheral_clk",
260}; 246};
261 247
262static struct resource tmu4_resources[] = { 248static struct resource tmu4_resources[] = {
263 [0] = { 249 [0] = {
264 .name = "TMU4",
265 .start = 0xffd88014, 250 .start = 0xffd88014,
266 .end = 0xffd8801f, 251 .end = 0xffd8801f,
267 .flags = IORESOURCE_MEM, 252 .flags = IORESOURCE_MEM,
@@ -283,15 +268,12 @@ static struct platform_device tmu4_device = {
283}; 268};
284 269
285static struct sh_timer_config tmu5_platform_data = { 270static struct sh_timer_config tmu5_platform_data = {
286 .name = "TMU5",
287 .channel_offset = 0x1c, 271 .channel_offset = 0x1c,
288 .timer_bit = 2, 272 .timer_bit = 2,
289 .clk = "peripheral_clk",
290}; 273};
291 274
292static struct resource tmu5_resources[] = { 275static struct resource tmu5_resources[] = {
293 [0] = { 276 [0] = {
294 .name = "TMU5",
295 .start = 0xffd88020, 277 .start = 0xffd88020,
296 .end = 0xffd8802b, 278 .end = 0xffd8802b,
297 .flags = IORESOURCE_MEM, 279 .flags = IORESOURCE_MEM,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7770.c b/arch/sh/kernel/cpu/sh4a/setup-sh7770.c
index 86d681ecf90e..7270d7fd6761 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7770.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7770.c
@@ -165,16 +165,13 @@ static struct platform_device scif9_device = {
165}; 165};
166 166
167static struct sh_timer_config tmu0_platform_data = { 167static struct sh_timer_config tmu0_platform_data = {
168 .name = "TMU0",
169 .channel_offset = 0x04, 168 .channel_offset = 0x04,
170 .timer_bit = 0, 169 .timer_bit = 0,
171 .clk = "peripheral_clk",
172 .clockevent_rating = 200, 170 .clockevent_rating = 200,
173}; 171};
174 172
175static struct resource tmu0_resources[] = { 173static struct resource tmu0_resources[] = {
176 [0] = { 174 [0] = {
177 .name = "TMU0",
178 .start = 0xffd80008, 175 .start = 0xffd80008,
179 .end = 0xffd80013, 176 .end = 0xffd80013,
180 .flags = IORESOURCE_MEM, 177 .flags = IORESOURCE_MEM,
@@ -196,16 +193,13 @@ static struct platform_device tmu0_device = {
196}; 193};
197 194
198static struct sh_timer_config tmu1_platform_data = { 195static struct sh_timer_config tmu1_platform_data = {
199 .name = "TMU1",
200 .channel_offset = 0x10, 196 .channel_offset = 0x10,
201 .timer_bit = 1, 197 .timer_bit = 1,
202 .clk = "peripheral_clk",
203 .clocksource_rating = 200, 198 .clocksource_rating = 200,
204}; 199};
205 200
206static struct resource tmu1_resources[] = { 201static struct resource tmu1_resources[] = {
207 [0] = { 202 [0] = {
208 .name = "TMU1",
209 .start = 0xffd80014, 203 .start = 0xffd80014,
210 .end = 0xffd8001f, 204 .end = 0xffd8001f,
211 .flags = IORESOURCE_MEM, 205 .flags = IORESOURCE_MEM,
@@ -227,15 +221,12 @@ static struct platform_device tmu1_device = {
227}; 221};
228 222
229static struct sh_timer_config tmu2_platform_data = { 223static struct sh_timer_config tmu2_platform_data = {
230 .name = "TMU2",
231 .channel_offset = 0x1c, 224 .channel_offset = 0x1c,
232 .timer_bit = 2, 225 .timer_bit = 2,
233 .clk = "peripheral_clk",
234}; 226};
235 227
236static struct resource tmu2_resources[] = { 228static struct resource tmu2_resources[] = {
237 [0] = { 229 [0] = {
238 .name = "TMU2",
239 .start = 0xffd80020, 230 .start = 0xffd80020,
240 .end = 0xffd8002f, 231 .end = 0xffd8002f,
241 .flags = IORESOURCE_MEM, 232 .flags = IORESOURCE_MEM,
@@ -257,15 +248,12 @@ static struct platform_device tmu2_device = {
257}; 248};
258 249
259static struct sh_timer_config tmu3_platform_data = { 250static struct sh_timer_config tmu3_platform_data = {
260 .name = "TMU3",
261 .channel_offset = 0x04, 251 .channel_offset = 0x04,
262 .timer_bit = 0, 252 .timer_bit = 0,
263 .clk = "peripheral_clk",
264}; 253};
265 254
266static struct resource tmu3_resources[] = { 255static struct resource tmu3_resources[] = {
267 [0] = { 256 [0] = {
268 .name = "TMU3",
269 .start = 0xffd81008, 257 .start = 0xffd81008,
270 .end = 0xffd81013, 258 .end = 0xffd81013,
271 .flags = IORESOURCE_MEM, 259 .flags = IORESOURCE_MEM,
@@ -287,15 +275,12 @@ static struct platform_device tmu3_device = {
287}; 275};
288 276
289static struct sh_timer_config tmu4_platform_data = { 277static struct sh_timer_config tmu4_platform_data = {
290 .name = "TMU4",
291 .channel_offset = 0x10, 278 .channel_offset = 0x10,
292 .timer_bit = 1, 279 .timer_bit = 1,
293 .clk = "peripheral_clk",
294}; 280};
295 281
296static struct resource tmu4_resources[] = { 282static struct resource tmu4_resources[] = {
297 [0] = { 283 [0] = {
298 .name = "TMU4",
299 .start = 0xffd81014, 284 .start = 0xffd81014,
300 .end = 0xffd8101f, 285 .end = 0xffd8101f,
301 .flags = IORESOURCE_MEM, 286 .flags = IORESOURCE_MEM,
@@ -317,15 +302,12 @@ static struct platform_device tmu4_device = {
317}; 302};
318 303
319static struct sh_timer_config tmu5_platform_data = { 304static struct sh_timer_config tmu5_platform_data = {
320 .name = "TMU5",
321 .channel_offset = 0x1c, 305 .channel_offset = 0x1c,
322 .timer_bit = 2, 306 .timer_bit = 2,
323 .clk = "peripheral_clk",
324}; 307};
325 308
326static struct resource tmu5_resources[] = { 309static struct resource tmu5_resources[] = {
327 [0] = { 310 [0] = {
328 .name = "TMU5",
329 .start = 0xffd81020, 311 .start = 0xffd81020,
330 .end = 0xffd8102f, 312 .end = 0xffd8102f,
331 .flags = IORESOURCE_MEM, 313 .flags = IORESOURCE_MEM,
@@ -347,15 +329,12 @@ static struct platform_device tmu5_device = {
347}; 329};
348 330
349static struct sh_timer_config tmu6_platform_data = { 331static struct sh_timer_config tmu6_platform_data = {
350 .name = "TMU6",
351 .channel_offset = 0x04, 332 .channel_offset = 0x04,
352 .timer_bit = 0, 333 .timer_bit = 0,
353 .clk = "peripheral_clk",
354}; 334};
355 335
356static struct resource tmu6_resources[] = { 336static struct resource tmu6_resources[] = {
357 [0] = { 337 [0] = {
358 .name = "TMU6",
359 .start = 0xffd82008, 338 .start = 0xffd82008,
360 .end = 0xffd82013, 339 .end = 0xffd82013,
361 .flags = IORESOURCE_MEM, 340 .flags = IORESOURCE_MEM,
@@ -377,15 +356,12 @@ static struct platform_device tmu6_device = {
377}; 356};
378 357
379static struct sh_timer_config tmu7_platform_data = { 358static struct sh_timer_config tmu7_platform_data = {
380 .name = "TMU7",
381 .channel_offset = 0x10, 359 .channel_offset = 0x10,
382 .timer_bit = 1, 360 .timer_bit = 1,
383 .clk = "peripheral_clk",
384}; 361};
385 362
386static struct resource tmu7_resources[] = { 363static struct resource tmu7_resources[] = {
387 [0] = { 364 [0] = {
388 .name = "TMU7",
389 .start = 0xffd82014, 365 .start = 0xffd82014,
390 .end = 0xffd8201f, 366 .end = 0xffd8201f,
391 .flags = IORESOURCE_MEM, 367 .flags = IORESOURCE_MEM,
@@ -407,15 +383,12 @@ static struct platform_device tmu7_device = {
407}; 383};
408 384
409static struct sh_timer_config tmu8_platform_data = { 385static struct sh_timer_config tmu8_platform_data = {
410 .name = "TMU8",
411 .channel_offset = 0x1c, 386 .channel_offset = 0x1c,
412 .timer_bit = 2, 387 .timer_bit = 2,
413 .clk = "peripheral_clk",
414}; 388};
415 389
416static struct resource tmu8_resources[] = { 390static struct resource tmu8_resources[] = {
417 [0] = { 391 [0] = {
418 .name = "TMU8",
419 .start = 0xffd82020, 392 .start = 0xffd82020,
420 .end = 0xffd8202b, 393 .end = 0xffd8202b,
421 .flags = IORESOURCE_MEM, 394 .flags = IORESOURCE_MEM,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7780.c b/arch/sh/kernel/cpu/sh4a/setup-sh7780.c
index 02e792c90de6..05fc38df1582 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7780.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7780.c
@@ -49,16 +49,13 @@ static struct platform_device scif1_device = {
49}; 49};
50 50
51static struct sh_timer_config tmu0_platform_data = { 51static struct sh_timer_config tmu0_platform_data = {
52 .name = "TMU0",
53 .channel_offset = 0x04, 52 .channel_offset = 0x04,
54 .timer_bit = 0, 53 .timer_bit = 0,
55 .clk = "peripheral_clk",
56 .clockevent_rating = 200, 54 .clockevent_rating = 200,
57}; 55};
58 56
59static struct resource tmu0_resources[] = { 57static struct resource tmu0_resources[] = {
60 [0] = { 58 [0] = {
61 .name = "TMU0",
62 .start = 0xffd80008, 59 .start = 0xffd80008,
63 .end = 0xffd80013, 60 .end = 0xffd80013,
64 .flags = IORESOURCE_MEM, 61 .flags = IORESOURCE_MEM,
@@ -80,16 +77,13 @@ static struct platform_device tmu0_device = {
80}; 77};
81 78
82static struct sh_timer_config tmu1_platform_data = { 79static struct sh_timer_config tmu1_platform_data = {
83 .name = "TMU1",
84 .channel_offset = 0x10, 80 .channel_offset = 0x10,
85 .timer_bit = 1, 81 .timer_bit = 1,
86 .clk = "peripheral_clk",
87 .clocksource_rating = 200, 82 .clocksource_rating = 200,
88}; 83};
89 84
90static struct resource tmu1_resources[] = { 85static struct resource tmu1_resources[] = {
91 [0] = { 86 [0] = {
92 .name = "TMU1",
93 .start = 0xffd80014, 87 .start = 0xffd80014,
94 .end = 0xffd8001f, 88 .end = 0xffd8001f,
95 .flags = IORESOURCE_MEM, 89 .flags = IORESOURCE_MEM,
@@ -111,15 +105,12 @@ static struct platform_device tmu1_device = {
111}; 105};
112 106
113static struct sh_timer_config tmu2_platform_data = { 107static struct sh_timer_config tmu2_platform_data = {
114 .name = "TMU2",
115 .channel_offset = 0x1c, 108 .channel_offset = 0x1c,
116 .timer_bit = 2, 109 .timer_bit = 2,
117 .clk = "peripheral_clk",
118}; 110};
119 111
120static struct resource tmu2_resources[] = { 112static struct resource tmu2_resources[] = {
121 [0] = { 113 [0] = {
122 .name = "TMU2",
123 .start = 0xffd80020, 114 .start = 0xffd80020,
124 .end = 0xffd8002f, 115 .end = 0xffd8002f,
125 .flags = IORESOURCE_MEM, 116 .flags = IORESOURCE_MEM,
@@ -141,15 +132,12 @@ static struct platform_device tmu2_device = {
141}; 132};
142 133
143static struct sh_timer_config tmu3_platform_data = { 134static struct sh_timer_config tmu3_platform_data = {
144 .name = "TMU3",
145 .channel_offset = 0x04, 135 .channel_offset = 0x04,
146 .timer_bit = 0, 136 .timer_bit = 0,
147 .clk = "peripheral_clk",
148}; 137};
149 138
150static struct resource tmu3_resources[] = { 139static struct resource tmu3_resources[] = {
151 [0] = { 140 [0] = {
152 .name = "TMU3",
153 .start = 0xffdc0008, 141 .start = 0xffdc0008,
154 .end = 0xffdc0013, 142 .end = 0xffdc0013,
155 .flags = IORESOURCE_MEM, 143 .flags = IORESOURCE_MEM,
@@ -171,15 +159,12 @@ static struct platform_device tmu3_device = {
171}; 159};
172 160
173static struct sh_timer_config tmu4_platform_data = { 161static struct sh_timer_config tmu4_platform_data = {
174 .name = "TMU4",
175 .channel_offset = 0x10, 162 .channel_offset = 0x10,
176 .timer_bit = 1, 163 .timer_bit = 1,
177 .clk = "peripheral_clk",
178}; 164};
179 165
180static struct resource tmu4_resources[] = { 166static struct resource tmu4_resources[] = {
181 [0] = { 167 [0] = {
182 .name = "TMU4",
183 .start = 0xffdc0014, 168 .start = 0xffdc0014,
184 .end = 0xffdc001f, 169 .end = 0xffdc001f,
185 .flags = IORESOURCE_MEM, 170 .flags = IORESOURCE_MEM,
@@ -201,15 +186,12 @@ static struct platform_device tmu4_device = {
201}; 186};
202 187
203static struct sh_timer_config tmu5_platform_data = { 188static struct sh_timer_config tmu5_platform_data = {
204 .name = "TMU5",
205 .channel_offset = 0x1c, 189 .channel_offset = 0x1c,
206 .timer_bit = 2, 190 .timer_bit = 2,
207 .clk = "peripheral_clk",
208}; 191};
209 192
210static struct resource tmu5_resources[] = { 193static struct resource tmu5_resources[] = {
211 [0] = { 194 [0] = {
212 .name = "TMU5",
213 .start = 0xffdc0020, 195 .start = 0xffdc0020,
214 .end = 0xffdc002b, 196 .end = 0xffdc002b,
215 .flags = IORESOURCE_MEM, 197 .flags = IORESOURCE_MEM,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c
index 1fcd88b1671e..07bb2d4619f8 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c
@@ -25,7 +25,6 @@ static struct plat_sci_port scif0_platform_data = {
 	.flags = UPF_BOOT_AUTOCONF,
 	.type = PORT_SCIF,
 	.irqs = { 40, 40, 40, 40 },
-	.clk = "scif_fck",
 };
 
 static struct platform_device scif0_device = {
@@ -41,7 +40,6 @@ static struct plat_sci_port scif1_platform_data = {
 	.flags = UPF_BOOT_AUTOCONF,
 	.type = PORT_SCIF,
 	.irqs = { 44, 44, 44, 44 },
-	.clk = "scif_fck",
 };
 
 static struct platform_device scif1_device = {
@@ -57,7 +55,6 @@ static struct plat_sci_port scif2_platform_data = {
 	.flags = UPF_BOOT_AUTOCONF,
 	.type = PORT_SCIF,
 	.irqs = { 60, 60, 60, 60 },
-	.clk = "scif_fck",
 };
 
 static struct platform_device scif2_device = {
@@ -73,7 +70,6 @@ static struct plat_sci_port scif3_platform_data = {
 	.flags = UPF_BOOT_AUTOCONF,
 	.type = PORT_SCIF,
 	.irqs = { 61, 61, 61, 61 },
-	.clk = "scif_fck",
 };
 
 static struct platform_device scif3_device = {
@@ -89,7 +85,6 @@ static struct plat_sci_port scif4_platform_data = {
 	.flags = UPF_BOOT_AUTOCONF,
 	.type = PORT_SCIF,
 	.irqs = { 62, 62, 62, 62 },
-	.clk = "scif_fck",
 };
 
 static struct platform_device scif4_device = {
@@ -105,7 +100,6 @@ static struct plat_sci_port scif5_platform_data = {
 	.flags = UPF_BOOT_AUTOCONF,
 	.type = PORT_SCIF,
 	.irqs = { 63, 63, 63, 63 },
-	.clk = "scif_fck",
 };
 
 static struct platform_device scif5_device = {
@@ -117,16 +111,13 @@ static struct platform_device scif5_device = {
 };
 
 static struct sh_timer_config tmu0_platform_data = {
-	.name = "TMU0",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "tmu012_fck",
 	.clockevent_rating = 200,
 };
 
 static struct resource tmu0_resources[] = {
 	[0] = {
-		.name = "TMU0",
 		.start = 0xffd80008,
 		.end = 0xffd80013,
 		.flags = IORESOURCE_MEM,
@@ -148,16 +139,13 @@ static struct platform_device tmu0_device = {
 };
 
 static struct sh_timer_config tmu1_platform_data = {
-	.name = "TMU1",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "tmu012_fck",
 	.clocksource_rating = 200,
 };
 
 static struct resource tmu1_resources[] = {
 	[0] = {
-		.name = "TMU1",
 		.start = 0xffd80014,
 		.end = 0xffd8001f,
 		.flags = IORESOURCE_MEM,
@@ -179,15 +167,12 @@ static struct platform_device tmu1_device = {
 };
 
 static struct sh_timer_config tmu2_platform_data = {
-	.name = "TMU2",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "tmu012_fck",
 };
 
 static struct resource tmu2_resources[] = {
 	[0] = {
-		.name = "TMU2",
 		.start = 0xffd80020,
 		.end = 0xffd8002f,
 		.flags = IORESOURCE_MEM,
@@ -209,15 +194,12 @@ static struct platform_device tmu2_device = {
 };
 
 static struct sh_timer_config tmu3_platform_data = {
-	.name = "TMU3",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "tmu345_fck",
 };
 
 static struct resource tmu3_resources[] = {
 	[0] = {
-		.name = "TMU3",
 		.start = 0xffdc0008,
 		.end = 0xffdc0013,
 		.flags = IORESOURCE_MEM,
@@ -239,15 +221,12 @@ static struct platform_device tmu3_device = {
 };
 
 static struct sh_timer_config tmu4_platform_data = {
-	.name = "TMU4",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "tmu345_fck",
 };
 
 static struct resource tmu4_resources[] = {
 	[0] = {
-		.name = "TMU4",
 		.start = 0xffdc0014,
 		.end = 0xffdc001f,
 		.flags = IORESOURCE_MEM,
@@ -269,15 +248,12 @@ static struct platform_device tmu4_device = {
 };
 
 static struct sh_timer_config tmu5_platform_data = {
-	.name = "TMU5",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "tmu345_fck",
 };
 
 static struct resource tmu5_resources[] = {
 	[0] = {
-		.name = "TMU5",
 		.start = 0xffdc0020,
 		.end = 0xffdc002b,
 		.flags = IORESOURCE_MEM,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7786.c b/arch/sh/kernel/cpu/sh4a/setup-sh7786.c
index 7e585320710a..f5599907ac3d 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7786.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7786.c
@@ -117,16 +117,13 @@ static struct platform_device scif5_device = {
 };
 
 static struct sh_timer_config tmu0_platform_data = {
-	.name = "TMU0",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
 static struct resource tmu0_resources[] = {
 	[0] = {
-		.name = "TMU0",
 		.start = 0xffd80008,
 		.end = 0xffd80013,
 		.flags = IORESOURCE_MEM,
@@ -148,16 +145,13 @@ static struct platform_device tmu0_device = {
 };
 
 static struct sh_timer_config tmu1_platform_data = {
-	.name = "TMU1",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
 static struct resource tmu1_resources[] = {
 	[0] = {
-		.name = "TMU1",
 		.start = 0xffd80014,
 		.end = 0xffd8001f,
 		.flags = IORESOURCE_MEM,
@@ -179,15 +173,12 @@ static struct platform_device tmu1_device = {
 };
 
 static struct sh_timer_config tmu2_platform_data = {
-	.name = "TMU2",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
 	[0] = {
-		.name = "TMU2",
 		.start = 0xffd80020,
 		.end = 0xffd8002f,
 		.flags = IORESOURCE_MEM,
@@ -209,15 +200,12 @@ static struct platform_device tmu2_device = {
 };
 
 static struct sh_timer_config tmu3_platform_data = {
-	.name = "TMU3",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "peripheral_clk",
 };
 
 static struct resource tmu3_resources[] = {
 	[0] = {
-		.name = "TMU3",
 		.start = 0xffda0008,
 		.end = 0xffda0013,
 		.flags = IORESOURCE_MEM,
@@ -239,15 +227,12 @@ static struct platform_device tmu3_device = {
 };
 
 static struct sh_timer_config tmu4_platform_data = {
-	.name = "TMU4",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "peripheral_clk",
 };
 
 static struct resource tmu4_resources[] = {
 	[0] = {
-		.name = "TMU4",
 		.start = 0xffda0014,
 		.end = 0xffda001f,
 		.flags = IORESOURCE_MEM,
@@ -269,15 +254,12 @@ static struct platform_device tmu4_device = {
 };
 
 static struct sh_timer_config tmu5_platform_data = {
-	.name = "TMU5",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "peripheral_clk",
 };
 
 static struct resource tmu5_resources[] = {
 	[0] = {
-		.name = "TMU5",
 		.start = 0xffda0020,
 		.end = 0xffda002b,
 		.flags = IORESOURCE_MEM,
@@ -299,15 +281,12 @@ static struct platform_device tmu5_device = {
 };
 
 static struct sh_timer_config tmu6_platform_data = {
-	.name = "TMU6",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "peripheral_clk",
 };
 
 static struct resource tmu6_resources[] = {
 	[0] = {
-		.name = "TMU6",
 		.start = 0xffdc0008,
 		.end = 0xffdc0013,
 		.flags = IORESOURCE_MEM,
@@ -329,15 +308,12 @@ static struct platform_device tmu6_device = {
 };
 
 static struct sh_timer_config tmu7_platform_data = {
-	.name = "TMU7",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "peripheral_clk",
 };
 
 static struct resource tmu7_resources[] = {
 	[0] = {
-		.name = "TMU7",
 		.start = 0xffdc0014,
 		.end = 0xffdc001f,
 		.flags = IORESOURCE_MEM,
@@ -359,15 +335,12 @@ static struct platform_device tmu7_device = {
 };
 
 static struct sh_timer_config tmu8_platform_data = {
-	.name = "TMU8",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "peripheral_clk",
 };
 
 static struct resource tmu8_resources[] = {
 	[0] = {
-		.name = "TMU8",
 		.start = 0xffdc0020,
 		.end = 0xffdc002b,
 		.flags = IORESOURCE_MEM,
@@ -389,15 +362,12 @@ static struct platform_device tmu8_device = {
 };
 
 static struct sh_timer_config tmu9_platform_data = {
-	.name = "TMU9",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "peripheral_clk",
 };
 
 static struct resource tmu9_resources[] = {
 	[0] = {
-		.name = "TMU9",
 		.start = 0xffde0008,
 		.end = 0xffde0013,
 		.flags = IORESOURCE_MEM,
@@ -419,15 +389,12 @@ static struct platform_device tmu9_device = {
 };
 
 static struct sh_timer_config tmu10_platform_data = {
-	.name = "TMU10",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "peripheral_clk",
 };
 
 static struct resource tmu10_resources[] = {
 	[0] = {
-		.name = "TMU10",
 		.start = 0xffde0014,
 		.end = 0xffde001f,
 		.flags = IORESOURCE_MEM,
@@ -449,15 +416,12 @@ static struct platform_device tmu10_device = {
 };
 
 static struct sh_timer_config tmu11_platform_data = {
-	.name = "TMU11",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "peripheral_clk",
 };
 
 static struct resource tmu11_resources[] = {
 	[0] = {
-		.name = "TMU11",
 		.start = 0xffde0020,
 		.end = 0xffde002b,
 		.flags = IORESOURCE_MEM,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-shx3.c b/arch/sh/kernel/cpu/sh4a/setup-shx3.c
index 780ba17a5599..9158bc5ea38b 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-shx3.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-shx3.c
@@ -70,16 +70,13 @@ static struct platform_device scif2_device = {
 };
 
 static struct sh_timer_config tmu0_platform_data = {
-	.name = "TMU0",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
 static struct resource tmu0_resources[] = {
 	[0] = {
-		.name = "TMU0",
 		.start = 0xffc10008,
 		.end = 0xffc10013,
 		.flags = IORESOURCE_MEM,
@@ -101,16 +98,13 @@ static struct platform_device tmu0_device = {
 };
 
 static struct sh_timer_config tmu1_platform_data = {
-	.name = "TMU1",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
 static struct resource tmu1_resources[] = {
 	[0] = {
-		.name = "TMU1",
 		.start = 0xffc10014,
 		.end = 0xffc1001f,
 		.flags = IORESOURCE_MEM,
@@ -132,15 +126,12 @@ static struct platform_device tmu1_device = {
 };
 
 static struct sh_timer_config tmu2_platform_data = {
-	.name = "TMU2",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
 	[0] = {
-		.name = "TMU2",
 		.start = 0xffc10020,
 		.end = 0xffc1002f,
 		.flags = IORESOURCE_MEM,
@@ -162,15 +153,12 @@ static struct platform_device tmu2_device = {
 };
 
 static struct sh_timer_config tmu3_platform_data = {
-	.name = "TMU3",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "peripheral_clk",
 };
 
 static struct resource tmu3_resources[] = {
 	[0] = {
-		.name = "TMU3",
 		.start = 0xffc20008,
 		.end = 0xffc20013,
 		.flags = IORESOURCE_MEM,
@@ -192,15 +180,12 @@ static struct platform_device tmu3_device = {
 };
 
 static struct sh_timer_config tmu4_platform_data = {
-	.name = "TMU4",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "peripheral_clk",
 };
 
 static struct resource tmu4_resources[] = {
 	[0] = {
-		.name = "TMU4",
 		.start = 0xffc20014,
 		.end = 0xffc2001f,
 		.flags = IORESOURCE_MEM,
@@ -222,15 +207,12 @@ static struct platform_device tmu4_device = {
 };
 
 static struct sh_timer_config tmu5_platform_data = {
-	.name = "TMU5",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "peripheral_clk",
 };
 
 static struct resource tmu5_resources[] = {
 	[0] = {
-		.name = "TMU5",
 		.start = 0xffc20020,
 		.end = 0xffc2002b,
 		.flags = IORESOURCE_MEM,
diff --git a/arch/sh/kernel/cpu/sh5/setup-sh5.c b/arch/sh/kernel/cpu/sh5/setup-sh5.c
index e7a3c1e4b604..d910666142b1 100644
--- a/arch/sh/kernel/cpu/sh5/setup-sh5.c
+++ b/arch/sh/kernel/cpu/sh5/setup-sh5.c
@@ -68,16 +68,13 @@ static struct platform_device rtc_device = {
 #define TMU2_BASE (TMU_BASE + 0x8 + (0xc * 0x2))
 
 static struct sh_timer_config tmu0_platform_data = {
-	.name = "TMU0",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
 static struct resource tmu0_resources[] = {
 	[0] = {
-		.name = "TMU0",
 		.start = TMU0_BASE,
 		.end = TMU0_BASE + 0xc - 1,
 		.flags = IORESOURCE_MEM,
@@ -99,16 +96,13 @@ static struct platform_device tmu0_device = {
 };
 
 static struct sh_timer_config tmu1_platform_data = {
-	.name = "TMU1",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
 static struct resource tmu1_resources[] = {
 	[0] = {
-		.name = "TMU1",
 		.start = TMU1_BASE,
 		.end = TMU1_BASE + 0xc - 1,
 		.flags = IORESOURCE_MEM,
@@ -130,15 +124,12 @@ static struct platform_device tmu1_device = {
 };
 
 static struct sh_timer_config tmu2_platform_data = {
-	.name = "TMU2",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
 	[0] = {
-		.name = "TMU2",
 		.start = TMU2_BASE,
 		.end = TMU2_BASE + 0xc - 1,
 		.flags = IORESOURCE_MEM,
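
All of the sh_timer_config hunks above make the same mechanical change: the per-channel .name string and the hard-coded .clk lookup string are dropped, along with the .name on each TMU memory resource. Below is a minimal sketch of the post-patch shape; the clkdev-style lookup in the probe helper is an assumption about the driver side (clk_get() keyed on the device, NULL connection id), not something shown in this diff.

#include <linux/clk.h>
#include <linux/err.h>
#include <linux/platform_device.h>
#include <linux/sh_timer.h>

/* Post-patch platform data: channel geometry only, no name/clk strings. */
static struct sh_timer_config tmu0_platform_data = {
	.channel_offset		= 0x04,
	.timer_bit		= 0,
	.clockevent_rating	= 200,
};

/* Assumed driver side (sketch): the clock is resolved per-device via
 * clkdev; a NULL connection id means "this device's own clock". */
static int tmu_clk_lookup_sketch(struct platform_device *pdev)
{
	struct clk *clk = clk_get(&pdev->dev, NULL);

	if (IS_ERR(clk))
		return PTR_ERR(clk);
	clk_put(clk);
	return 0;
}
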
diff --git a/arch/sh/kernel/cpufreq.c b/arch/sh/kernel/cpufreq.c
index dce4f3ff0932..0fffacea6ed9 100644
--- a/arch/sh/kernel/cpufreq.c
+++ b/arch/sh/kernel/cpufreq.c
@@ -48,7 +48,7 @@ static int sh_cpufreq_target(struct cpufreq_policy *policy,
 		return -ENODEV;
 
 	cpus_allowed = current->cpus_allowed;
-	set_cpus_allowed(current, cpumask_of_cpu(cpu));
+	set_cpus_allowed_ptr(current, cpumask_of(cpu));
 
 	BUG_ON(smp_processor_id() != cpu);
 
@@ -66,7 +66,7 @@ static int sh_cpufreq_target(struct cpufreq_policy *policy,
 	freqs.flags = 0;
 
 	cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
-	set_cpus_allowed(current, cpus_allowed);
+	set_cpus_allowed_ptr(current, &cpus_allowed);
 	clk_set_rate(cpuclk, freq);
 	cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
 
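
The cpufreq hunks are an API migration: the by-value set_cpus_allowed()/cpumask_of_cpu() pair becomes the pointer-based set_cpus_allowed_ptr()/cpumask_of(). The surrounding pin-then-restore pattern, pulled out into a self-contained sketch (the function name is illustrative):

#include <linux/cpumask.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/smp.h>

/* Pin the current task to one CPU, do per-CPU work, then restore the
 * original affinity -- the pattern used by sh_cpufreq_target() above. */
static int run_on_cpu_sketch(unsigned int cpu)
{
	cpumask_t cpus_allowed = current->cpus_allowed;	/* save old mask */

	set_cpus_allowed_ptr(current, cpumask_of(cpu));	/* pin */
	BUG_ON(smp_processor_id() != cpu);

	/* ... per-CPU work goes here ... */

	set_cpus_allowed_ptr(current, &cpus_allowed);	/* restore */
	return 0;
}
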
diff --git a/arch/sh/kernel/dwarf.c b/arch/sh/kernel/dwarf.c
index bd1c497280a6..94739ee7aa74 100644
--- a/arch/sh/kernel/dwarf.c
+++ b/arch/sh/kernel/dwarf.c
@@ -727,7 +727,7 @@ static int dwarf_parse_cie(void *entry, void *p, unsigned long len,
 			   unsigned char *end, struct module *mod)
 {
 	struct rb_node **rb_node = &cie_root.rb_node;
-	struct rb_node *parent;
+	struct rb_node *parent = *rb_node;
 	struct dwarf_cie *cie;
 	unsigned long flags;
 	int count;
@@ -856,7 +856,7 @@ static int dwarf_parse_fde(void *entry, u32 entry_type,
 			   unsigned char *end, struct module *mod)
 {
 	struct rb_node **rb_node = &fde_root.rb_node;
-	struct rb_node *parent;
+	struct rb_node *parent = *rb_node;
 	struct dwarf_fde *fde;
 	struct dwarf_cie *cie;
 	unsigned long flags;
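
Both dwarf.c hunks fix the same latent bug: parent was only assigned inside the rbtree descent loop, so an insertion into an empty tree handed rb_link_node() an uninitialized pointer. Seeding it from *rb_node makes the empty-tree case yield NULL. The idiom in generic form (a sketch; the item type is illustrative):

#include <linux/rbtree.h>

struct item {
	struct rb_node node;
	unsigned long key;
};

/* Standard rbtree insertion descent. When the tree is empty the loop
 * body never runs, so parent must start out as *link (NULL in that
 * case) -- exactly the initialization the hunks above add. */
static void item_insert(struct rb_root *root, struct item *new)
{
	struct rb_node **link = &root->rb_node;
	struct rb_node *parent = *link;

	while (*link) {
		struct item *this = rb_entry(*link, struct item, node);

		parent = *link;
		if (new->key < this->key)
			link = &(*link)->rb_left;
		else
			link = &(*link)->rb_right;
	}

	rb_link_node(&new->node, parent, link);
	rb_insert_color(&new->node, root);
}
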
diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c
index 0fd7b41f0a22..273f890b17ae 100644
--- a/arch/sh/kernel/idle.c
+++ b/arch/sh/kernel/idle.c
@@ -112,7 +112,7 @@ void cpu_idle(void)
 	}
 }
 
-void __cpuinit select_idle_routine(void)
+void __init select_idle_routine(void)
 {
 	/*
 	 * If a platform has set its own idle routine, leave it alone.
diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c
index 9f253e9cce01..81b6de41ae5d 100644
--- a/arch/sh/kernel/perf_event.c
+++ b/arch/sh/kernel/perf_event.c
@@ -315,7 +315,7 @@ void hw_perf_disable(void)
 	sh_pmu->disable_all();
 }
 
-int register_sh_pmu(struct sh_pmu *pmu)
+int __cpuinit register_sh_pmu(struct sh_pmu *pmu)
 {
 	if (sh_pmu)
 		return -EBUSY;
diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c
index c90957a459ac..c0d40f671ecd 100644
--- a/arch/sh/kernel/process_64.c
+++ b/arch/sh/kernel/process_64.c
@@ -504,13 +504,6 @@ out:
 	return error;
 }
 
-/*
- * These bracket the sleeping functions..
- */
-extern void interruptible_sleep_on(wait_queue_head_t *q);
-
-#define mid_sched ((unsigned long) interruptible_sleep_on)
-
 #ifdef CONFIG_FRAME_POINTER
 static int in_sh64_switch_to(unsigned long pc)
 {
diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c
index e124cf7008df..002cc612deef 100644
--- a/arch/sh/kernel/smp.c
+++ b/arch/sh/kernel/smp.c
@@ -69,6 +69,7 @@ asmlinkage void __cpuinit start_secondary(void)
 	unsigned int cpu;
 	struct mm_struct *mm = &init_mm;
 
+	enable_mmu();
 	atomic_inc(&mm->mm_count);
 	atomic_inc(&mm->mm_users);
 	current->active_mm = mm;
diff --git a/arch/sh/mm/Makefile b/arch/sh/mm/Makefile
index 3dc8a8a63822..c73018a9972c 100644
--- a/arch/sh/mm/Makefile
+++ b/arch/sh/mm/Makefile
@@ -18,13 +18,14 @@ mmu-$(CONFIG_MMU) := extable_$(BITS).o fault_$(BITS).o \
 			ioremap.o kmap.o pgtable.o tlbflush_$(BITS).o
 
 obj-y			+= $(mmu-y)
-obj-$(CONFIG_DEBUG_FS)	+= asids-debugfs.o
 
-ifdef CONFIG_DEBUG_FS
-obj-$(CONFIG_CPU_SH4)	+= cache-debugfs.o
+debugfs-y			:= asids-debugfs.o
+ifndef CONFIG_CACHE_OFF
+debugfs-$(CONFIG_CPU_SH4)	+= cache-debugfs.o
 endif
 
 ifdef CONFIG_MMU
+debugfs-$(CONFIG_CPU_SH4)	+= tlb-debugfs.o
 tlb-$(CONFIG_CPU_SH3)		:= tlb-sh3.o
 tlb-$(CONFIG_CPU_SH4)		:= tlb-sh4.o tlb-urb.o
 tlb-$(CONFIG_CPU_SH5)		:= tlb-sh5.o
@@ -32,6 +33,7 @@ tlb-$(CONFIG_CPU_HAS_PTEAEX)	:= tlb-pteaex.o tlb-urb.o
 obj-y				+= $(tlb-y)
 endif
 
+obj-$(CONFIG_DEBUG_FS)		+= $(debugfs-y)
 obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
 obj-$(CONFIG_PMB)		+= pmb.o
 obj-$(CONFIG_NUMA)		+= numa.o
diff --git a/arch/sh/mm/pmb.c b/arch/sh/mm/pmb.c
index a4662e2782c3..3cc21933063b 100644
--- a/arch/sh/mm/pmb.c
+++ b/arch/sh/mm/pmb.c
@@ -323,6 +323,7 @@ static void __clear_pmb_entry(struct pmb_entry *pmbe)
 	writel_uncached(data_val & ~PMB_V, data);
 }
 
+#ifdef CONFIG_PM
 static void set_pmb_entry(struct pmb_entry *pmbe)
 {
 	unsigned long flags;
@@ -331,6 +332,7 @@ static void set_pmb_entry(struct pmb_entry *pmbe)
 	__set_pmb_entry(pmbe);
 	spin_unlock_irqrestore(&pmbe->lock, flags);
 }
+#endif /* CONFIG_PM */
 
 int pmb_bolt_mapping(unsigned long vaddr, phys_addr_t phys,
 		     unsigned long size, pgprot_t prot)
@@ -802,7 +804,7 @@ void __init pmb_init(void)
 	writel_uncached(0, PMB_IRMCR);
 
 	/* Flush out the TLB */
-	__raw_writel(__raw_readl(MMUCR) | MMUCR_TI, MMUCR);
+	local_flush_tlb_all();
 	ctrl_barrier();
 }
 
diff --git a/arch/sh/mm/tlb-debugfs.c b/arch/sh/mm/tlb-debugfs.c
new file mode 100644
index 000000000000..229bf75f28df
--- /dev/null
+++ b/arch/sh/mm/tlb-debugfs.c
@@ -0,0 +1,179 @@
+/*
+ * arch/sh/mm/tlb-debugfs.c
+ *
+ * debugfs ops for SH-4 ITLB/UTLBs.
+ *
+ * Copyright (C) 2010 Matt Fleming
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <asm/processor.h>
+#include <asm/mmu_context.h>
+#include <asm/tlbflush.h>
+
+enum tlb_type {
+	TLB_TYPE_ITLB,
+	TLB_TYPE_UTLB,
+};
+
+static struct {
+	int bits;
+	const char *size;
+} tlb_sizes[] = {
+	{ 0x0, " 1KB" },
+	{ 0x1, " 4KB" },
+	{ 0x2, " 8KB" },
+	{ 0x4, " 64KB" },
+	{ 0x5, "256KB" },
+	{ 0x7, " 1MB" },
+	{ 0x8, " 4MB" },
+	{ 0xc, " 64MB" },
+};
+
+static int tlb_seq_show(struct seq_file *file, void *iter)
+{
+	unsigned int tlb_type = (unsigned int)file->private;
+	unsigned long addr1, addr2, data1, data2;
+	unsigned long flags;
+	unsigned long mmucr;
+	unsigned int nentries, entry;
+	unsigned int urb;
+
+	mmucr = __raw_readl(MMUCR);
+	if ((mmucr & 0x1) == 0) {
+		seq_printf(file, "address translation disabled\n");
+		return 0;
+	}
+
+	if (tlb_type == TLB_TYPE_ITLB) {
+		addr1 = MMU_ITLB_ADDRESS_ARRAY;
+		addr2 = MMU_ITLB_ADDRESS_ARRAY2;
+		data1 = MMU_ITLB_DATA_ARRAY;
+		data2 = MMU_ITLB_DATA_ARRAY2;
+		nentries = 4;
+	} else {
+		addr1 = MMU_UTLB_ADDRESS_ARRAY;
+		addr2 = MMU_UTLB_ADDRESS_ARRAY2;
+		data1 = MMU_UTLB_DATA_ARRAY;
+		data2 = MMU_UTLB_DATA_ARRAY2;
+		nentries = 64;
+	}
+
+	local_irq_save(flags);
+	jump_to_uncached();
+
+	urb = (mmucr & MMUCR_URB) >> MMUCR_URB_SHIFT;
+
+	/* Make the "entry >= urb" test fail. */
+	if (urb == 0)
+		urb = MMUCR_URB_NENTRIES + 1;
+
+	if (tlb_type == TLB_TYPE_ITLB) {
+		addr1 = MMU_ITLB_ADDRESS_ARRAY;
+		addr2 = MMU_ITLB_ADDRESS_ARRAY2;
+		data1 = MMU_ITLB_DATA_ARRAY;
+		data2 = MMU_ITLB_DATA_ARRAY2;
+		nentries = 4;
+	} else {
+		addr1 = MMU_UTLB_ADDRESS_ARRAY;
+		addr2 = MMU_UTLB_ADDRESS_ARRAY2;
+		data1 = MMU_UTLB_DATA_ARRAY;
+		data2 = MMU_UTLB_DATA_ARRAY2;
+		nentries = 64;
+	}
+
+	seq_printf(file, "entry: vpn ppn asid size valid wired\n");
+
+	for (entry = 0; entry < nentries; entry++) {
+		unsigned long vpn, ppn, asid, size;
+		unsigned long valid;
+		unsigned long val;
+		const char *sz = " ?";
+		int i;
+
+		val = __raw_readl(addr1 | (entry << MMU_TLB_ENTRY_SHIFT));
+		ctrl_barrier();
+		vpn = val & 0xfffffc00;
+		valid = val & 0x100;
+
+		val = __raw_readl(addr2 | (entry << MMU_TLB_ENTRY_SHIFT));
+		ctrl_barrier();
+		asid = val & MMU_CONTEXT_ASID_MASK;
+
+		val = __raw_readl(data1 | (entry << MMU_TLB_ENTRY_SHIFT));
+		ctrl_barrier();
+		ppn = (val & 0x0ffffc00) << 4;
+
+		val = __raw_readl(data2 | (entry << MMU_TLB_ENTRY_SHIFT));
+		ctrl_barrier();
+		size = (val & 0xf0) >> 4;
+
+		for (i = 0; i < ARRAY_SIZE(tlb_sizes); i++) {
+			if (tlb_sizes[i].bits == size)
+				break;
+		}
+
+		if (i != ARRAY_SIZE(tlb_sizes))
+			sz = tlb_sizes[i].size;
+
+		seq_printf(file, "%2d: 0x%08lx 0x%08lx %5lu %s %s %s\n",
+			   entry, vpn, ppn, asid,
+			   sz, valid ? "V" : "-",
+			   (urb <= entry) ? "W" : "-");
+	}
+
+	back_to_cached();
+	local_irq_restore(flags);
+
+	return 0;
+}
+
+static int tlb_debugfs_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, tlb_seq_show, inode->i_private);
+}
+
+static const struct file_operations tlb_debugfs_fops = {
+	.owner		= THIS_MODULE,
+	.open		= tlb_debugfs_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init tlb_debugfs_init(void)
+{
+	struct dentry *itlb, *utlb;
+
+	itlb = debugfs_create_file("itlb", S_IRUSR, sh_debugfs_root,
+				   (unsigned int *)TLB_TYPE_ITLB,
+				   &tlb_debugfs_fops);
+	if (unlikely(!itlb))
+		return -ENOMEM;
+	if (IS_ERR(itlb))
+		return PTR_ERR(itlb);
+
+	utlb = debugfs_create_file("utlb", S_IRUSR, sh_debugfs_root,
+				   (unsigned int *)TLB_TYPE_UTLB,
+				   &tlb_debugfs_fops);
+	if (unlikely(!utlb)) {
+		debugfs_remove(itlb);
+		return -ENOMEM;
+	}
+
+	if (IS_ERR(utlb)) {
+		debugfs_remove(itlb);
+		return PTR_ERR(utlb);
+	}
+
+	return 0;
+}
+module_init(tlb_debugfs_init);
+
+MODULE_LICENSE("GPL v2");
diff --git a/arch/sh/mm/tlb-pteaex.c b/arch/sh/mm/tlb-pteaex.c
index 32dc674c550c..bdd0982b56ee 100644
--- a/arch/sh/mm/tlb-pteaex.c
+++ b/arch/sh/mm/tlb-pteaex.c
@@ -73,5 +73,7 @@ void local_flush_tlb_one(unsigned long asid, unsigned long page)
 	jump_to_uncached();
 	__raw_writel(page, MMU_UTLB_ADDRESS_ARRAY | MMU_PAGE_ASSOC_BIT);
 	__raw_writel(asid, MMU_UTLB_ADDRESS_ARRAY2 | MMU_PAGE_ASSOC_BIT);
+	__raw_writel(page, MMU_ITLB_ADDRESS_ARRAY | MMU_PAGE_ASSOC_BIT);
+	__raw_writel(asid, MMU_ITLB_ADDRESS_ARRAY2 | MMU_PAGE_ASSOC_BIT);
 	back_to_cached();
 }
diff --git a/arch/sh/mm/tlb-urb.c b/arch/sh/mm/tlb-urb.c
index bb5b9098956d..c92ce20db39b 100644
--- a/arch/sh/mm/tlb-urb.c
+++ b/arch/sh/mm/tlb-urb.c
@@ -24,13 +24,9 @@ void tlb_wire_entry(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
 
 	local_irq_save(flags);
 
-	/* Load the entry into the TLB */
-	__update_tlb(vma, addr, pte);
-
-	/* ... and wire it up. */
 	status = __raw_readl(MMUCR);
 	urb = (status & MMUCR_URB) >> MMUCR_URB_SHIFT;
-	status &= ~MMUCR_URB;
+	status &= ~MMUCR_URC;
 
 	/*
 	 * Make sure we're not trying to wire the last TLB entry slot.
@@ -39,7 +35,23 @@ void tlb_wire_entry(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
 
 	urb = urb % MMUCR_URB_NENTRIES;
 
+	/*
+	 * Insert this entry into the highest non-wired TLB slot (via
+	 * the URC field).
+	 */
+	status |= (urb << MMUCR_URC_SHIFT);
+	__raw_writel(status, MMUCR);
+	ctrl_barrier();
+
+	/* Load the entry into the TLB */
+	__update_tlb(vma, addr, pte);
+
+	/* ... and wire it up. */
+	status = __raw_readl(MMUCR);
+
+	status &= ~MMUCR_URB;
 	status |= (urb << MMUCR_URB_SHIFT);
+
 	__raw_writel(status, MMUCR);
 	ctrl_barrier();
 
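
The tlb-urb.c reordering is the substantive part of this hunk: the MMUCR.URC field selects which TLB slot the next entry load goes to, so the slot must be selected before __update_tlb() runs; only afterwards is the wired boundary (URB) raised over it. Condensed from the patched tlb_wire_entry() into a sketch (locking and the slot-exhaustion check are omitted):

#include <linux/mm.h>
#include <asm/mmu_context.h>
#include <asm/tlb.h>

/* Sketch of the ordering the patch establishes: steer the load with
 * URC first, then raise the wired boundary (URB) over the new entry. */
static void tlb_wire_sequence_sketch(struct vm_area_struct *vma,
				     unsigned long addr, pte_t pte,
				     unsigned long urb)
{
	unsigned long status;

	status = __raw_readl(MMUCR);
	status &= ~MMUCR_URC;
	status |= (urb << MMUCR_URC_SHIFT);	/* select the target slot */
	__raw_writel(status, MMUCR);
	ctrl_barrier();

	__update_tlb(vma, addr, pte);		/* entry lands in that slot */

	status = __raw_readl(MMUCR);
	status &= ~MMUCR_URB;
	status |= (urb << MMUCR_URB_SHIFT);	/* slot now below URB: wired */
	__raw_writel(status, MMUCR);
	ctrl_barrier();
}
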
diff --git a/arch/sh/mm/tlbflush_32.c b/arch/sh/mm/tlbflush_32.c
index 004bb3f25b5f..77dc5efa7127 100644
--- a/arch/sh/mm/tlbflush_32.c
+++ b/arch/sh/mm/tlbflush_32.c
@@ -123,18 +123,27 @@ void local_flush_tlb_mm(struct mm_struct *mm)
 void local_flush_tlb_all(void)
 {
 	unsigned long flags, status;
+	int i;
 
 	/*
 	 * Flush all the TLB.
-	 *
-	 * Write to the MMU control register's bit:
-	 *	TF-bit for SH-3, TI-bit for SH-4.
-	 *	It's same position, bit #2.
 	 */
 	local_irq_save(flags);
+	jump_to_uncached();
+
 	status = __raw_readl(MMUCR);
-	status |= 0x04;
-	__raw_writel(status, MMUCR);
+	status = ((status & MMUCR_URB) >> MMUCR_URB_SHIFT);
+
+	if (status == 0)
+		status = MMUCR_URB_NENTRIES;
+
+	for (i = 0; i < status; i++)
+		__raw_writel(0x0, MMU_UTLB_ADDRESS_ARRAY | (i << 8));
+
+	for (i = 0; i < 4; i++)
+		__raw_writel(0x0, MMU_ITLB_ADDRESS_ARRAY | (i << 8));
+
+	back_to_cached();
 	ctrl_barrier();
 	local_irq_restore(flags);
 }
diff --git a/arch/sh/mm/uncached.c b/arch/sh/mm/uncached.c
index cf20a5c5136a..8a4eca551fc0 100644
--- a/arch/sh/mm/uncached.c
+++ b/arch/sh/mm/uncached.c
@@ -1,6 +1,8 @@
 #include <linux/init.h>
+#include <linux/module.h>
 #include <asm/sizes.h>
 #include <asm/page.h>
+#include <asm/addrspace.h>
 
 /*
  * This is the offset of the uncached section from its cached alias.
@@ -15,15 +17,22 @@
 unsigned long cached_to_uncached = SZ_512M;
 unsigned long uncached_size = SZ_512M;
 unsigned long uncached_start, uncached_end;
+EXPORT_SYMBOL(uncached_start);
+EXPORT_SYMBOL(uncached_end);
 
 int virt_addr_uncached(unsigned long kaddr)
 {
 	return (kaddr >= uncached_start) && (kaddr < uncached_end);
 }
+EXPORT_SYMBOL(virt_addr_uncached);
 
 void __init uncached_init(void)
 {
+#ifdef CONFIG_29BIT
+	uncached_start = P2SEG;
+#else
 	uncached_start = memory_end;
+#endif
 	uncached_end = uncached_start + uncached_size;
 }
 
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 573458f1caf2..b87e0b6970cb 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -348,10 +348,12 @@ static void amd_pmu_cpu_offline(int cpu)
 
 	raw_spin_lock(&amd_nb_lock);
 
-	if (--cpuhw->amd_nb->refcnt == 0)
-		kfree(cpuhw->amd_nb);
+	if (cpuhw->amd_nb) {
+		if (--cpuhw->amd_nb->refcnt == 0)
+			kfree(cpuhw->amd_nb);
 
-	cpuhw->amd_nb = NULL;
+		cpuhw->amd_nb = NULL;
+	}
 
 	raw_spin_unlock(&amd_nb_lock);
 }
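
The perf_event_amd fix is a plain defensive pattern: only drop the refcount and free when the pointer is actually set, and clear it inside the guarded block. In isolation (a sketch; the names shared/nb_lock/put_shared are illustrative, not kernel API):

#include <linux/slab.h>
#include <linux/spinlock.h>

struct shared { int refcnt; };

static DEFINE_RAW_SPINLOCK(nb_lock);

/* Drop a possibly-absent shared reference, mirroring the guarded
 * refcount drop in amd_pmu_cpu_offline() above. */
static void put_shared(struct shared **slot)
{
	raw_spin_lock(&nb_lock);

	if (*slot) {			/* slot may never have been set up */
		if (--(*slot)->refcnt == 0)
			kfree(*slot);
		*slot = NULL;
	}

	raw_spin_unlock(&nb_lock);
}
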
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index fb7fc24fe727..189cbc2585fa 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -8,6 +8,7 @@
 #include <linux/acpi.h>
 #include <linux/signal.h>
 #include <linux/kthread.h>
+#include <linux/dmi.h>
 
 #include <acpi/acpi_drivers.h>
 
@@ -1032,6 +1033,41 @@ static void acpi_add_id(struct acpi_device *device, const char *dev_id)
 	list_add_tail(&id->list, &device->pnp.ids);
 }
 
+/*
+ * Old IBM workstations have a DSDT bug wherein the SMBus object
+ * lacks the SMBUS01 HID and the methods do not have the necessary "_"
+ * prefix. Work around this.
+ */
+static int acpi_ibm_smbus_match(struct acpi_device *device)
+{
+	acpi_handle h_dummy;
+	struct acpi_buffer path = {ACPI_ALLOCATE_BUFFER, NULL};
+	int result;
+
+	if (!dmi_name_in_vendors("IBM"))
+		return -ENODEV;
+
+	/* Look for SMBS object */
+	result = acpi_get_name(device->handle, ACPI_SINGLE_NAME, &path);
+	if (result)
+		return result;
+
+	if (strcmp("SMBS", path.pointer)) {
+		result = -ENODEV;
+		goto out;
+	}
+
+	/* Does it have the necessary (but misnamed) methods? */
+	result = -ENODEV;
+	if (ACPI_SUCCESS(acpi_get_handle(device->handle, "SBI", &h_dummy)) &&
+	    ACPI_SUCCESS(acpi_get_handle(device->handle, "SBR", &h_dummy)) &&
+	    ACPI_SUCCESS(acpi_get_handle(device->handle, "SBW", &h_dummy)))
+		result = 0;
+out:
+	kfree(path.pointer);
+	return result;
+}
+
 static void acpi_device_set_id(struct acpi_device *device)
 {
 	acpi_status status;
@@ -1082,6 +1118,8 @@ static void acpi_device_set_id(struct acpi_device *device)
 			acpi_add_id(device, ACPI_BAY_HID);
 		else if (ACPI_SUCCESS(acpi_dock_match(device)))
 			acpi_add_id(device, ACPI_DOCK_HID);
+		else if (!acpi_ibm_smbus_match(device))
+			acpi_add_id(device, ACPI_SMBUS_IBM_HID);
 
 		break;
 	case ACPI_BUS_TYPE_POWER:
diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
index 561dec2481cb..277477251a86 100644
--- a/drivers/ata/libata-sff.c
+++ b/drivers/ata/libata-sff.c
@@ -1667,6 +1667,7 @@ unsigned int ata_sff_host_intr(struct ata_port *ap,
 {
 	struct ata_eh_info *ehi = &ap->link.eh_info;
 	u8 status, host_stat = 0;
+	bool bmdma_stopped = false;
 
 	VPRINTK("ata%u: protocol %d task_state %d\n",
 		ap->print_id, qc->tf.protocol, ap->hsm_task_state);
@@ -1699,6 +1700,7 @@ unsigned int ata_sff_host_intr(struct ata_port *ap,
 
 		/* before we do anything else, clear DMA-Start bit */
 		ap->ops->bmdma_stop(qc);
+		bmdma_stopped = true;
 
 		if (unlikely(host_stat & ATA_DMA_ERR)) {
 			/* error when transfering data to/from memory */
@@ -1716,8 +1718,14 @@ unsigned int ata_sff_host_intr(struct ata_port *ap,
 
 	/* check main status, clearing INTRQ if needed */
 	status = ata_sff_irq_status(ap);
-	if (status & ATA_BUSY)
-		goto idle_irq;
+	if (status & ATA_BUSY) {
+		if (bmdma_stopped) {
+			/* BMDMA engine is already stopped, we're screwed */
+			qc->err_mask |= AC_ERR_HSM;
+			ap->hsm_task_state = HSM_ST_ERR;
+		} else
+			goto idle_irq;
+	}
 
 	/* ack bmdma irq events */
 	ap->ops->sff_irq_clear(ap);
@@ -1762,13 +1770,16 @@ EXPORT_SYMBOL_GPL(ata_sff_host_intr);
 irqreturn_t ata_sff_interrupt(int irq, void *dev_instance)
 {
 	struct ata_host *host = dev_instance;
+	bool retried = false;
 	unsigned int i;
-	unsigned int handled = 0, polling = 0;
+	unsigned int handled, idle, polling;
 	unsigned long flags;
 
 	/* TODO: make _irqsave conditional on x86 PCI IDE legacy mode */
 	spin_lock_irqsave(&host->lock, flags);
 
+retry:
+	handled = idle = polling = 0;
 	for (i = 0; i < host->n_ports; i++) {
 		struct ata_port *ap = host->ports[i];
 		struct ata_queued_cmd *qc;
@@ -1782,7 +1793,8 @@ irqreturn_t ata_sff_interrupt(int irq, void *dev_instance)
 				handled |= ata_sff_host_intr(ap, qc);
 			else
 				polling |= 1 << i;
-		}
+		} else
+			idle |= 1 << i;
 	}
 
 	/*
@@ -1790,7 +1802,9 @@ irqreturn_t ata_sff_interrupt(int irq, void *dev_instance)
 	 * asserting IRQ line, nobody cared will ensue. Check IRQ
 	 * pending status if available and clear spurious IRQ.
 	 */
-	if (!handled) {
+	if (!handled && !retried) {
+		bool retry = false;
+
 		for (i = 0; i < host->n_ports; i++) {
 			struct ata_port *ap = host->ports[i];
 
@@ -1805,8 +1819,23 @@ irqreturn_t ata_sff_interrupt(int irq, void *dev_instance)
 			ata_port_printk(ap, KERN_INFO,
 					"clearing spurious IRQ\n");
 
-			ap->ops->sff_check_status(ap);
-			ap->ops->sff_irq_clear(ap);
+			if (idle & (1 << i)) {
+				ap->ops->sff_check_status(ap);
+				ap->ops->sff_irq_clear(ap);
+			} else {
+				/* clear INTRQ and check if BUSY cleared */
+				if (!(ap->ops->sff_check_status(ap) & ATA_BUSY))
+					retry |= true;
+				/*
+				 * With command in flight, we can't do
+				 * sff_irq_clear() w/o racing with completion.
+				 */
+			}
+		}
+
+		if (retry) {
+			retried = true;
+			goto retry;
 		}
 	}
 
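
The ata_sff_interrupt() rework tracks three per-port outcomes (handled, polling, idle) and bounds the new behavior with a single retry: on a pass where nothing was handled, idle ports get the usual status-read plus IRQ-clear, ports with a command in flight only get a status read, and if BUSY has cleared on one of them the whole handler is re-run once. The decision logic reduced to a standalone sketch (port_idle(), ack_port(), port_still_busy() are hypothetical stand-ins, not libata API, and here the retry re-runs only this loop rather than the full handler):

#include <stdbool.h>

/* Hypothetical helpers standing in for the libata callbacks. */
extern bool port_idle(int i);		/* no command in flight */
extern void ack_port(int i);		/* sff_check_status + sff_irq_clear */
extern bool port_still_busy(int i);	/* sff_check_status & ATA_BUSY */

/* Control flow of the reworked spurious-IRQ path (sketch). */
static void spurious_irq_pass(int n_ports)
{
	bool retried = false;
	bool retry;
	int i;

again:
	retry = false;
	for (i = 0; i < n_ports; i++) {
		if (port_idle(i))
			ack_port(i);	/* safe: nothing can complete */
		else if (!port_still_busy(i))
			retry = true;	/* BUSY cleared, take one more pass */
		/* else: command in flight; clearing INTRQ here would race
		 * with normal completion, so leave it alone */
	}

	if (retry && !retried) {
		retried = true;
		goto again;
	}
}
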
diff --git a/drivers/ata/pata_via.c b/drivers/ata/pata_via.c
index 3059ec017de3..95d39c36acea 100644
--- a/drivers/ata/pata_via.c
+++ b/drivers/ata/pata_via.c
@@ -677,6 +677,7 @@ static const struct pci_device_id via[] = {
 	{ PCI_VDEVICE(VIA, 0x3164), },
 	{ PCI_VDEVICE(VIA, 0x5324), },
 	{ PCI_VDEVICE(VIA, 0xC409), VIA_IDFLAG_SINGLE },
+	{ PCI_VDEVICE(VIA, 0x9001), VIA_IDFLAG_SINGLE },
 
 	{ },
 };
diff --git a/drivers/base/class.c b/drivers/base/class.c
index 0147f476b8a9..9c6a0d6408e7 100644
--- a/drivers/base/class.c
+++ b/drivers/base/class.c
@@ -219,6 +219,8 @@ static void class_create_release(struct class *cls)
  * This is used to create a struct class pointer that can then be used
  * in calls to device_create().
  *
+ * Returns &struct class pointer on success, or ERR_PTR() on error.
+ *
  * Note, the pointer created here is to be destroyed when finished by
  * making a call to class_destroy().
  */
diff --git a/drivers/base/core.c b/drivers/base/core.c
index ef55df34ddd0..b56a0ba31d4a 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -1345,6 +1345,8 @@ static void root_device_release(struct device *dev)
  * 'module' symlink which points to the @owner directory
  * in sysfs.
  *
+ * Returns &struct device pointer on success, or ERR_PTR() on error.
+ *
  * Note: You probably want to use root_device_register().
  */
 struct device *__root_device_register(const char *name, struct module *owner)
@@ -1432,6 +1434,8 @@ static void device_create_release(struct device *dev)
  * Any further sysfs files that might be required can be created using this
  * pointer.
  *
+ * Returns &struct device pointer on success, or ERR_PTR() on error.
+ *
  * Note: the struct class passed to this function must have previously
  * been created with a call to class_create().
  */
@@ -1492,6 +1496,8 @@ EXPORT_SYMBOL_GPL(device_create_vargs);
  * Any further sysfs files that might be required can be created using this
  * pointer.
  *
+ * Returns &struct device pointer on success, or ERR_PTR() on error.
+ *
  * Note: the struct class passed to this function must have previously
  * been created with a call to class_create().
  */
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 7036e8e96ab8..b5242e1e8bc4 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -79,24 +79,24 @@ void unregister_cpu(struct cpu *cpu)
 }
 
 #ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
-static ssize_t cpu_probe_store(struct sys_device *dev,
-			       struct sysdev_attribute *attr,
+static ssize_t cpu_probe_store(struct sysdev_class *class,
+			       struct sysdev_class_attribute *attr,
 			       const char *buf,
 			       size_t count)
 {
 	return arch_cpu_probe(buf, count);
 }
 
-static ssize_t cpu_release_store(struct sys_device *dev,
-				 struct sysdev_attribute *attr,
+static ssize_t cpu_release_store(struct sysdev_class *class,
+				 struct sysdev_class_attribute *attr,
 				 const char *buf,
 				 size_t count)
 {
 	return arch_cpu_release(buf, count);
 }
 
-static SYSDEV_ATTR(probe, S_IWUSR, NULL, cpu_probe_store);
-static SYSDEV_ATTR(release, S_IWUSR, NULL, cpu_release_store);
+static SYSDEV_CLASS_ATTR(probe, S_IWUSR, NULL, cpu_probe_store);
+static SYSDEV_CLASS_ATTR(release, S_IWUSR, NULL, cpu_release_store);
 #endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */
 
 #else /* ... !CONFIG_HOTPLUG_CPU */
diff --git a/drivers/base/firmware_class.c b/drivers/base/firmware_class.c
index d0dc26ad5387..18518ba13c81 100644
--- a/drivers/base/firmware_class.c
+++ b/drivers/base/firmware_class.c
@@ -78,6 +78,7 @@ firmware_timeout_show(struct class *class,
 /**
  * firmware_timeout_store - set number of seconds to wait for firmware
  * @class: device class pointer
+ * @attr: device attribute pointer
  * @buf: buffer to scan for timeout value
  * @count: number of bytes in @buf
  *
@@ -442,6 +443,7 @@ static int fw_setup_device(struct firmware *fw, struct device **dev_p,
 	fw_priv = dev_get_drvdata(f_dev);
 
 	fw_priv->fw = fw;
+	sysfs_bin_attr_init(&fw_priv->attr_data);
 	retval = sysfs_create_bin_file(&f_dev->kobj, &fw_priv->attr_data);
 	if (retval) {
 		dev_err(device, "%s: sysfs_create_bin_file failed\n", __func__);
diff --git a/drivers/base/node.c b/drivers/base/node.c
index ad43185ec15a..93b3ac65c2d4 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -165,8 +165,11 @@ static ssize_t node_read_distance(struct sys_device *dev,
 	int len = 0;
 	int i;
 
-	/* buf currently PAGE_SIZE, need ~4 chars per node */
-	BUILD_BUG_ON(MAX_NUMNODES*4 > PAGE_SIZE/2);
+	/*
+	 * buf is currently PAGE_SIZE in length and each node needs 4 chars
+	 * at the most (distance + space or newline).
+	 */
+	BUILD_BUG_ON(MAX_NUMNODES * 4 > PAGE_SIZE);
 
 	for_each_online_node(i)
 		len += sprintf(buf + len, "%s%d", i ? " " : "", node_distance(nid, i));
diff --git a/drivers/base/platform.c b/drivers/base/platform.c
index 1ba9d617d241..d10230adeb36 100644
--- a/drivers/base/platform.c
+++ b/drivers/base/platform.c
@@ -362,6 +362,8 @@ EXPORT_SYMBOL_GPL(platform_device_unregister);
362 * enumeration tasks, they don't fully conform to the Linux driver model. 362 * enumeration tasks, they don't fully conform to the Linux driver model.
363 * In particular, when such drivers are built as modules, they can't be 363 * In particular, when such drivers are built as modules, they can't be
364 * "hotplugged". 364 * "hotplugged".
365 *
366 * Returns &struct platform_device pointer on success, or ERR_PTR() on error.
365 */ 367 */
366struct platform_device *platform_device_register_simple(const char *name, 368struct platform_device *platform_device_register_simple(const char *name,
367 int id, 369 int id,
@@ -408,6 +410,8 @@ EXPORT_SYMBOL_GPL(platform_device_register_simple);
408 * allocated for the device allows drivers using such devices to be 410 * allocated for the device allows drivers using such devices to be
409 * unloaded without waiting for the last reference to the device to be 411 * unloaded without waiting for the last reference to the device to be
410 * dropped. 412 * dropped.
413 *
414 * Returns &struct platform_device pointer on success, or ERR_PTR() on error.
411 */ 415 */
412struct platform_device *platform_device_register_data( 416struct platform_device *platform_device_register_data(
413 struct device *parent, 417 struct device *parent,
@@ -559,6 +563,8 @@ EXPORT_SYMBOL_GPL(platform_driver_probe);
559 * 563 *
560 * Use this in legacy-style modules that probe hardware directly and 564 * Use this in legacy-style modules that probe hardware directly and
561 * register a single platform device and corresponding platform driver. 565 * register a single platform device and corresponding platform driver.
566 *
567 * Returns &struct platform_device pointer on success, or ERR_PTR() on error.
562 */ 568 */
563struct platform_device * __init_or_module platform_create_bundle( 569struct platform_device * __init_or_module platform_create_bundle(
564 struct platform_driver *driver, 570 struct platform_driver *driver,
@@ -1052,9 +1058,11 @@ static __initdata LIST_HEAD(early_platform_driver_list);
1052static __initdata LIST_HEAD(early_platform_device_list); 1058static __initdata LIST_HEAD(early_platform_device_list);
1053 1059
1054/** 1060/**
1055 * early_platform_driver_register 1061 * early_platform_driver_register - register early platform driver
1056 * @epdrv: early_platform driver structure 1062 * @epdrv: early_platform driver structure
1057 * @buf: string passed from early_param() 1063 * @buf: string passed from early_param()
1064 *
1065 * Helper function for early_platform_init() / early_platform_init_buffer()
1058 */ 1066 */
1059int __init early_platform_driver_register(struct early_platform_driver *epdrv, 1067int __init early_platform_driver_register(struct early_platform_driver *epdrv,
1060 char *buf) 1068 char *buf)
@@ -1106,9 +1114,12 @@ int __init early_platform_driver_register(struct early_platform_driver *epdrv,
1106} 1114}
1107 1115
1108/** 1116/**
1109 * early_platform_add_devices - add a numbers of early platform devices 1117 * early_platform_add_devices - adds a number of early platform devices
1110 * @devs: array of early platform devices to add 1118 * @devs: array of early platform devices to add
1111 * @num: number of early platform devices in array 1119 * @num: number of early platform devices in array
1120 *
1121 * Used by early architecture code to register early platform devices and
1122 * their platform data.
1112 */ 1123 */
1113void __init early_platform_add_devices(struct platform_device **devs, int num) 1124void __init early_platform_add_devices(struct platform_device **devs, int num)
1114{ 1125{
@@ -1128,8 +1139,12 @@ void __init early_platform_add_devices(struct platform_device **devs, int num)
1128} 1139}
1129 1140
1130/** 1141/**
1131 * early_platform_driver_register_all 1142 * early_platform_driver_register_all - register early platform drivers
1132 * @class_str: string to identify early platform driver class 1143 * @class_str: string to identify early platform driver class
1144 *
1145 * Used by architecture code to register all early platform drivers
1146 * for a certain class. If omitted then only early platform drivers
1147 * with matching kernel command line class parameters will be registered.
1133 */ 1148 */
1134void __init early_platform_driver_register_all(char *class_str) 1149void __init early_platform_driver_register_all(char *class_str)
1135{ 1150{
@@ -1151,7 +1166,7 @@ void __init early_platform_driver_register_all(char *class_str)
1151} 1166}
1152 1167
1153/** 1168/**
1154 * early_platform_match 1169 * early_platform_match - find early platform device matching driver
1155 * @epdrv: early platform driver structure 1170 * @epdrv: early platform driver structure
1156 * @id: id to match against 1171 * @id: id to match against
1157 */ 1172 */
@@ -1169,7 +1184,7 @@ early_platform_match(struct early_platform_driver *epdrv, int id)
1169} 1184}
1170 1185
1171/** 1186/**
1172 * early_platform_left 1187 * early_platform_left - check if early platform driver has matching devices
1173 * @epdrv: early platform driver structure 1188 * @epdrv: early platform driver structure
1174 * @id: return true if id or above exists 1189 * @id: return true if id or above exists
1175 */ 1190 */
@@ -1187,7 +1202,7 @@ static __init int early_platform_left(struct early_platform_driver *epdrv,
1187} 1202}
1188 1203
1189/** 1204/**
1190 * early_platform_driver_probe_id 1205 * early_platform_driver_probe_id - probe drivers matching class_str and id
1191 * @class_str: string to identify early platform driver class 1206 * @class_str: string to identify early platform driver class
1192 * @id: id to match against 1207 * @id: id to match against
1193 * @nr_probe: number of platform devices to successfully probe before exiting 1208 * @nr_probe: number of platform devices to successfully probe before exiting
@@ -1239,6 +1254,26 @@ static int __init early_platform_driver_probe_id(char *class_str,
1239 } 1254 }
1240 1255
1241 if (match) { 1256 if (match) {
1257 /*
1258 * Set up a sensible init_name to enable
1259 * dev_name() and others to be used before the
1260 * rest of the driver core is initialized.
1261 */
1262 if (!match->dev.init_name) {
1263 if (match->id != -1)
1264 match->dev.init_name =
1265 kasprintf(GFP_KERNEL, "%s.%d",
1266 match->name,
1267 match->id);
1268 else
1269 match->dev.init_name =
1270 kasprintf(GFP_KERNEL, "%s",
1271 match->name);
1272
1273 if (!match->dev.init_name)
1274 return -ENOMEM;
1275 }
1276
1242 if (epdrv->pdrv->probe(match)) 1277 if (epdrv->pdrv->probe(match))
1243 pr_warning("%s: unable to probe %s early.\n", 1278 pr_warning("%s: unable to probe %s early.\n",
1244 class_str, match->name); 1279 class_str, match->name);
@@ -1257,10 +1292,14 @@ static int __init early_platform_driver_probe_id(char *class_str,
1257} 1292}
1258 1293
1259/** 1294/**
1260 * early_platform_driver_probe 1295 * early_platform_driver_probe - probe a class of registered drivers
1261 * @class_str: string to identify early platform driver class 1296 * @class_str: string to identify early platform driver class
1262 * @nr_probe: number of platform devices to successfully probe before exiting 1297 * @nr_probe: number of platform devices to successfully probe before exiting
1263 * @user_only: only probe user specified early platform devices 1298 * @user_only: only probe user specified early platform devices
1299 *
1300 * Used by architecture code to probe registered early platform drivers
1301 * within a certain class. For probe to happen a registered early platform
1302 * device matching a registered early platform driver is needed.
1264 */ 1303 */
1265int __init early_platform_driver_probe(char *class_str, 1304int __init early_platform_driver_probe(char *class_str,
1266 int nr_probe, 1305 int nr_probe,
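The kernel-doc added above documents the whole early platform flow: architecture code adds devices, registers a driver class, then probes before the driver core is up. A minimal, hypothetical board-side sketch (the "earlytimer" class and "sh_tmu" device name are illustrative, not taken from this patch):

/* Hypothetical board code exercising the documented helpers. */
static struct platform_device early_timer_device = {
	.name	= "sh_tmu",
	.id	= 0,	/* probe will set init_name to "sh_tmu.0" */
};

static struct platform_device *early_devices[] __initdata = {
	&early_timer_device,
};

void __init board_early_timer_setup(void)
{
	early_platform_add_devices(early_devices,
				   ARRAY_SIZE(early_devices));
	early_platform_driver_register_all("earlytimer");
	/* Probe at most one matching device this early. */
	early_platform_driver_probe("earlytimer", 1, 0);
}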
diff --git a/drivers/char/agp/intel-agp.c b/drivers/char/agp/intel-agp.c
index a3e10dc7cc25..b78d5c381efe 100644
--- a/drivers/char/agp/intel-agp.c
+++ b/drivers/char/agp/intel-agp.c
@@ -97,6 +97,9 @@ EXPORT_SYMBOL(intel_agp_enabled);
97#define IS_PINEVIEW (agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_PINEVIEW_M_HB || \ 97#define IS_PINEVIEW (agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_PINEVIEW_M_HB || \
98 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_PINEVIEW_HB) 98 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_PINEVIEW_HB)
99 99
100#define IS_SNB (agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_SANDYBRIDGE_HB || \
101 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_SANDYBRIDGE_M_HB)
102
100#define IS_G4X (agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_EAGLELAKE_HB || \ 103#define IS_G4X (agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_EAGLELAKE_HB || \
101 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_Q45_HB || \ 104 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_Q45_HB || \
102 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_G45_HB || \ 105 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_G45_HB || \
@@ -107,8 +110,7 @@ EXPORT_SYMBOL(intel_agp_enabled);
107 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_IRONLAKE_M_HB || \ 110 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_IRONLAKE_M_HB || \
108 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_IRONLAKE_MA_HB || \ 111 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_IRONLAKE_MA_HB || \
109 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_IRONLAKE_MC2_HB || \ 112 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_IRONLAKE_MC2_HB || \
110 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_SANDYBRIDGE_HB || \ 113 IS_SNB)
111 agp_bridge->dev->device == PCI_DEVICE_ID_INTEL_SANDYBRIDGE_M_HB)
112 114
113extern int agp_memory_reserved; 115extern int agp_memory_reserved;
114 116
@@ -175,6 +177,10 @@ extern int agp_memory_reserved;
175#define SNB_GMCH_GMS_STOLEN_448M (0xe << 3) 177#define SNB_GMCH_GMS_STOLEN_448M (0xe << 3)
176#define SNB_GMCH_GMS_STOLEN_480M (0xf << 3) 178#define SNB_GMCH_GMS_STOLEN_480M (0xf << 3)
177#define SNB_GMCH_GMS_STOLEN_512M (0x10 << 3) 179#define SNB_GMCH_GMS_STOLEN_512M (0x10 << 3)
180#define SNB_GTT_SIZE_0M (0 << 8)
181#define SNB_GTT_SIZE_1M (1 << 8)
182#define SNB_GTT_SIZE_2M (2 << 8)
183#define SNB_GTT_SIZE_MASK (3 << 8)
178 184
179static const struct aper_size_info_fixed intel_i810_sizes[] = 185static const struct aper_size_info_fixed intel_i810_sizes[] =
180{ 186{
@@ -1200,6 +1206,9 @@ static void intel_i9xx_setup_flush(void)
1200 if (intel_private.ifp_resource.start) 1206 if (intel_private.ifp_resource.start)
1201 return; 1207 return;
1202 1208
1209 if (IS_SNB)
1210 return;
1211
1203 /* setup a resource for this object */ 1212 /* setup a resource for this object */
1204 intel_private.ifp_resource.name = "Intel Flush Page"; 1213 intel_private.ifp_resource.name = "Intel Flush Page";
1205 intel_private.ifp_resource.flags = IORESOURCE_MEM; 1214 intel_private.ifp_resource.flags = IORESOURCE_MEM;
@@ -1438,6 +1447,8 @@ static unsigned long intel_i965_mask_memory(struct agp_bridge_data *bridge,
1438 1447
1439static void intel_i965_get_gtt_range(int *gtt_offset, int *gtt_size) 1448static void intel_i965_get_gtt_range(int *gtt_offset, int *gtt_size)
1440{ 1449{
1450 u16 snb_gmch_ctl;
1451
1441 switch (agp_bridge->dev->device) { 1452 switch (agp_bridge->dev->device) {
1442 case PCI_DEVICE_ID_INTEL_GM45_HB: 1453 case PCI_DEVICE_ID_INTEL_GM45_HB:
1443 case PCI_DEVICE_ID_INTEL_EAGLELAKE_HB: 1454 case PCI_DEVICE_ID_INTEL_EAGLELAKE_HB:
@@ -1449,9 +1460,26 @@ static void intel_i965_get_gtt_range(int *gtt_offset, int *gtt_size)
1449 case PCI_DEVICE_ID_INTEL_IRONLAKE_M_HB: 1460 case PCI_DEVICE_ID_INTEL_IRONLAKE_M_HB:
1450 case PCI_DEVICE_ID_INTEL_IRONLAKE_MA_HB: 1461 case PCI_DEVICE_ID_INTEL_IRONLAKE_MA_HB:
1451 case PCI_DEVICE_ID_INTEL_IRONLAKE_MC2_HB: 1462 case PCI_DEVICE_ID_INTEL_IRONLAKE_MC2_HB:
1463 *gtt_offset = *gtt_size = MB(2);
1464 break;
1452 case PCI_DEVICE_ID_INTEL_SANDYBRIDGE_HB: 1465 case PCI_DEVICE_ID_INTEL_SANDYBRIDGE_HB:
1453 case PCI_DEVICE_ID_INTEL_SANDYBRIDGE_M_HB: 1466 case PCI_DEVICE_ID_INTEL_SANDYBRIDGE_M_HB:
1454 *gtt_offset = *gtt_size = MB(2); 1467 *gtt_offset = MB(2);
1468
1469 pci_read_config_word(intel_private.pcidev, SNB_GMCH_CTRL, &snb_gmch_ctl);
1470 switch (snb_gmch_ctl & SNB_GTT_SIZE_MASK) {
1471 default:
1472 case SNB_GTT_SIZE_0M:
1473 printk(KERN_ERR "Bad GTT size mask: 0x%04x.\n", snb_gmch_ctl);
1474 *gtt_size = MB(0);
1475 break;
1476 case SNB_GTT_SIZE_1M:
1477 *gtt_size = MB(1);
1478 break;
1479 case SNB_GTT_SIZE_2M:
1480 *gtt_size = MB(2);
1481 break;
1482 }
1455 break; 1483 break;
1456 default: 1484 default:
1457 *gtt_offset = *gtt_size = KB(512); 1485 *gtt_offset = *gtt_size = KB(512);
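On Sandybridge the GTT is no longer a fixed 2 MB: bits 9:8 of the GMCH control word select 0, 1 or 2 MB, which is why the Ironlake IDs now break out early while the SNB IDs read SNB_GMCH_CTRL. A sketch of the decode in isolation, assuming only the masks defined above:

/* Returns the Sandybridge GTT size in KiB for a given GMCH control
 * word; 0 covers SNB_GTT_SIZE_0M and reserved encodings. */
static unsigned int snb_gtt_size_kb(u16 snb_gmch_ctl)
{
	switch (snb_gmch_ctl & SNB_GTT_SIZE_MASK) {
	case SNB_GTT_SIZE_1M:
		return 1024;
	case SNB_GTT_SIZE_2M:
		return 2048;
	default:
		return 0;
	}
}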
diff --git a/drivers/char/hvc_console.c b/drivers/char/hvc_console.c
index 465185fc0f52..ba55bba151b9 100644
--- a/drivers/char/hvc_console.c
+++ b/drivers/char/hvc_console.c
@@ -312,6 +312,7 @@ static int hvc_open(struct tty_struct *tty, struct file * filp)
312 spin_lock_irqsave(&hp->lock, flags); 312 spin_lock_irqsave(&hp->lock, flags);
313 /* Check and then increment for fast path open. */ 313 /* Check and then increment for fast path open. */
314 if (hp->count++ > 0) { 314 if (hp->count++ > 0) {
315 tty_kref_get(tty);
315 spin_unlock_irqrestore(&hp->lock, flags); 316 spin_unlock_irqrestore(&hp->lock, flags);
316 hvc_kick(); 317 hvc_kick();
317 return 0; 318 return 0;
@@ -319,7 +320,7 @@ static int hvc_open(struct tty_struct *tty, struct file * filp)
319 320
320 tty->driver_data = hp; 321 tty->driver_data = hp;
321 322
322 hp->tty = tty; 323 hp->tty = tty_kref_get(tty);
323 324
324 spin_unlock_irqrestore(&hp->lock, flags); 325 spin_unlock_irqrestore(&hp->lock, flags);
325 326
@@ -336,6 +337,7 @@ static int hvc_open(struct tty_struct *tty, struct file * filp)
336 spin_lock_irqsave(&hp->lock, flags); 337 spin_lock_irqsave(&hp->lock, flags);
337 hp->tty = NULL; 338 hp->tty = NULL;
338 spin_unlock_irqrestore(&hp->lock, flags); 339 spin_unlock_irqrestore(&hp->lock, flags);
340 tty_kref_put(tty);
339 tty->driver_data = NULL; 341 tty->driver_data = NULL;
340 kref_put(&hp->kref, destroy_hvc_struct); 342 kref_put(&hp->kref, destroy_hvc_struct);
341 printk(KERN_ERR "hvc_open: request_irq failed with rc %d.\n", rc); 343 printk(KERN_ERR "hvc_open: request_irq failed with rc %d.\n", rc);
@@ -363,13 +365,18 @@ static void hvc_close(struct tty_struct *tty, struct file * filp)
363 return; 365 return;
364 366
365 hp = tty->driver_data; 367 hp = tty->driver_data;
368
366 spin_lock_irqsave(&hp->lock, flags); 369 spin_lock_irqsave(&hp->lock, flags);
370 tty_kref_get(tty);
367 371
368 if (--hp->count == 0) { 372 if (--hp->count == 0) {
369 /* We are done with the tty pointer now. */ 373 /* We are done with the tty pointer now. */
370 hp->tty = NULL; 374 hp->tty = NULL;
371 spin_unlock_irqrestore(&hp->lock, flags); 375 spin_unlock_irqrestore(&hp->lock, flags);
372 376
377 /* Put the ref obtained in hvc_open() */
378 tty_kref_put(tty);
379
373 if (hp->ops->notifier_del) 380 if (hp->ops->notifier_del)
374 hp->ops->notifier_del(hp, hp->data); 381 hp->ops->notifier_del(hp, hp->data);
375 382
@@ -389,6 +396,7 @@ static void hvc_close(struct tty_struct *tty, struct file * filp)
389 spin_unlock_irqrestore(&hp->lock, flags); 396 spin_unlock_irqrestore(&hp->lock, flags);
390 } 397 }
391 398
399 tty_kref_put(tty);
392 kref_put(&hp->kref, destroy_hvc_struct); 400 kref_put(&hp->kref, destroy_hvc_struct);
393} 401}
394 402
@@ -424,10 +432,11 @@ static void hvc_hangup(struct tty_struct *tty)
424 spin_unlock_irqrestore(&hp->lock, flags); 432 spin_unlock_irqrestore(&hp->lock, flags);
425 433
426 if (hp->ops->notifier_hangup) 434 if (hp->ops->notifier_hangup)
427 hp->ops->notifier_hangup(hp, hp->data); 435 hp->ops->notifier_hangup(hp, hp->data);
428 436
429 while(temp_open_count) { 437 while(temp_open_count) {
430 --temp_open_count; 438 --temp_open_count;
439 tty_kref_put(tty);
431 kref_put(&hp->kref, destroy_hvc_struct); 440 kref_put(&hp->kref, destroy_hvc_struct);
432 } 441 }
433} 442}
@@ -592,7 +601,7 @@ int hvc_poll(struct hvc_struct *hp)
592 } 601 }
593 602
594 /* No tty attached, just skip */ 603 /* No tty attached, just skip */
595 tty = hp->tty; 604 tty = tty_kref_get(hp->tty);
596 if (tty == NULL) 605 if (tty == NULL)
597 goto bail; 606 goto bail;
598 607
@@ -672,6 +681,8 @@ int hvc_poll(struct hvc_struct *hp)
672 681
673 tty_flip_buffer_push(tty); 682 tty_flip_buffer_push(tty);
674 } 683 }
684 if (tty)
685 tty_kref_put(tty);
675 686
676 return poll_mask; 687 return poll_mask;
677} 688}
@@ -807,7 +818,7 @@ int hvc_remove(struct hvc_struct *hp)
807 struct tty_struct *tty; 818 struct tty_struct *tty;
808 819
809 spin_lock_irqsave(&hp->lock, flags); 820 spin_lock_irqsave(&hp->lock, flags);
810 tty = hp->tty; 821 tty = tty_kref_get(hp->tty);
811 822
812 if (hp->index < MAX_NR_HVC_CONSOLES) 823 if (hp->index < MAX_NR_HVC_CONSOLES)
813 vtermnos[hp->index] = -1; 824 vtermnos[hp->index] = -1;
@@ -819,18 +830,18 @@ int hvc_remove(struct hvc_struct *hp)
819 /* 830 /*
820 * We 'put' the instance that was grabbed when the kref instance 831 * We 'put' the instance that was grabbed when the kref instance
821 * was initialized using kref_init(). Let the last holder of this 832 * was initialized using kref_init(). Let the last holder of this
822 * kref cause it to be removed, which will probably be the tty_hangup 833 * kref cause it to be removed, which will probably be the tty_vhangup
823 * below. 834 * below.
824 */ 835 */
825 kref_put(&hp->kref, destroy_hvc_struct); 836 kref_put(&hp->kref, destroy_hvc_struct);
826 837
827 /* 838 /*
828 * This function call will auto chain call hvc_hangup. The tty should 839 * This function call will auto chain call hvc_hangup.
829 * always be valid at this time unless a simultaneous tty close already
830 * cleaned up the hvc_struct.
831 */ 840 */
832 if (tty) 841 if (tty) {
833 tty_hangup(tty); 842 tty_vhangup(tty);
843 tty_kref_put(tty);
844 }
834 return 0; 845 return 0;
835} 846}
836EXPORT_SYMBOL_GPL(hvc_remove); 847EXPORT_SYMBOL_GPL(hvc_remove);
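Every hvc_console hunk above enforces one rule: a hp->tty pointer may only be used outside hp->lock while holding a tty kref. A condensed sketch of the pattern (hvc_use_tty is hypothetical; hp is the driver's per-console struct):

static void hvc_use_tty(struct hvc_struct *hp)
{
	struct tty_struct *tty;
	unsigned long flags;

	spin_lock_irqsave(&hp->lock, flags);
	tty = tty_kref_get(hp->tty);	/* NULL-safe: NULL in, NULL out */
	spin_unlock_irqrestore(&hp->lock, flags);

	if (tty) {
		/* tty cannot be freed here, even across a racing hangup */
		tty_kref_put(tty);
	}
}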
diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c
index ec5e3f8df648..c6ad4234378d 100644
--- a/drivers/char/ipmi/ipmi_msghandler.c
+++ b/drivers/char/ipmi/ipmi_msghandler.c
@@ -2272,42 +2272,52 @@ static int create_files(struct bmc_device *bmc)
2272 bmc->device_id_attr.attr.name = "device_id"; 2272 bmc->device_id_attr.attr.name = "device_id";
2273 bmc->device_id_attr.attr.mode = S_IRUGO; 2273 bmc->device_id_attr.attr.mode = S_IRUGO;
2274 bmc->device_id_attr.show = device_id_show; 2274 bmc->device_id_attr.show = device_id_show;
2275 sysfs_attr_init(&bmc->device_id_attr.attr);
2275 2276
2276 bmc->provides_dev_sdrs_attr.attr.name = "provides_device_sdrs"; 2277 bmc->provides_dev_sdrs_attr.attr.name = "provides_device_sdrs";
2277 bmc->provides_dev_sdrs_attr.attr.mode = S_IRUGO; 2278 bmc->provides_dev_sdrs_attr.attr.mode = S_IRUGO;
2278 bmc->provides_dev_sdrs_attr.show = provides_dev_sdrs_show; 2279 bmc->provides_dev_sdrs_attr.show = provides_dev_sdrs_show;
2280 sysfs_attr_init(&bmc->provides_dev_sdrs_attr.attr);
2279 2281
2280 bmc->revision_attr.attr.name = "revision"; 2282 bmc->revision_attr.attr.name = "revision";
2281 bmc->revision_attr.attr.mode = S_IRUGO; 2283 bmc->revision_attr.attr.mode = S_IRUGO;
2282 bmc->revision_attr.show = revision_show; 2284 bmc->revision_attr.show = revision_show;
2285 sysfs_attr_init(&bmc->revision_attr.attr);
2283 2286
2284 bmc->firmware_rev_attr.attr.name = "firmware_revision"; 2287 bmc->firmware_rev_attr.attr.name = "firmware_revision";
2285 bmc->firmware_rev_attr.attr.mode = S_IRUGO; 2288 bmc->firmware_rev_attr.attr.mode = S_IRUGO;
2286 bmc->firmware_rev_attr.show = firmware_rev_show; 2289 bmc->firmware_rev_attr.show = firmware_rev_show;
2290 sysfs_attr_init(&bmc->firmware_rev_attr.attr);
2287 2291
2288 bmc->version_attr.attr.name = "ipmi_version"; 2292 bmc->version_attr.attr.name = "ipmi_version";
2289 bmc->version_attr.attr.mode = S_IRUGO; 2293 bmc->version_attr.attr.mode = S_IRUGO;
2290 bmc->version_attr.show = ipmi_version_show; 2294 bmc->version_attr.show = ipmi_version_show;
2295 sysfs_attr_init(&bmc->version_attr.attr);
2291 2296
2292 bmc->add_dev_support_attr.attr.name = "additional_device_support"; 2297 bmc->add_dev_support_attr.attr.name = "additional_device_support";
2293 bmc->add_dev_support_attr.attr.mode = S_IRUGO; 2298 bmc->add_dev_support_attr.attr.mode = S_IRUGO;
2294 bmc->add_dev_support_attr.show = add_dev_support_show; 2299 bmc->add_dev_support_attr.show = add_dev_support_show;
2300 sysfs_attr_init(&bmc->add_dev_support_attr.attr);
2295 2301
2296 bmc->manufacturer_id_attr.attr.name = "manufacturer_id"; 2302 bmc->manufacturer_id_attr.attr.name = "manufacturer_id";
2297 bmc->manufacturer_id_attr.attr.mode = S_IRUGO; 2303 bmc->manufacturer_id_attr.attr.mode = S_IRUGO;
2298 bmc->manufacturer_id_attr.show = manufacturer_id_show; 2304 bmc->manufacturer_id_attr.show = manufacturer_id_show;
2305 sysfs_attr_init(&bmc->manufacturer_id_attr.attr);
2299 2306
2300 bmc->product_id_attr.attr.name = "product_id"; 2307 bmc->product_id_attr.attr.name = "product_id";
2301 bmc->product_id_attr.attr.mode = S_IRUGO; 2308 bmc->product_id_attr.attr.mode = S_IRUGO;
2302 bmc->product_id_attr.show = product_id_show; 2309 bmc->product_id_attr.show = product_id_show;
2310 sysfs_attr_init(&bmc->product_id_attr.attr);
2303 2311
2304 bmc->guid_attr.attr.name = "guid"; 2312 bmc->guid_attr.attr.name = "guid";
2305 bmc->guid_attr.attr.mode = S_IRUGO; 2313 bmc->guid_attr.attr.mode = S_IRUGO;
2306 bmc->guid_attr.show = guid_show; 2314 bmc->guid_attr.show = guid_show;
2315 sysfs_attr_init(&bmc->guid_attr.attr);
2307 2316
2308 bmc->aux_firmware_rev_attr.attr.name = "aux_firmware_revision"; 2317 bmc->aux_firmware_rev_attr.attr.name = "aux_firmware_revision";
2309 bmc->aux_firmware_rev_attr.attr.mode = S_IRUGO; 2318 bmc->aux_firmware_rev_attr.attr.mode = S_IRUGO;
2310 bmc->aux_firmware_rev_attr.show = aux_firmware_rev_show; 2319 bmc->aux_firmware_rev_attr.show = aux_firmware_rev_show;
2320 sysfs_attr_init(&bmc->aux_firmware_rev_attr.attr);
2311 2321
2312 err = device_create_file(&bmc->dev->dev, 2322 err = device_create_file(&bmc->dev->dev,
2313 &bmc->device_id_attr); 2323 &bmc->device_id_attr);
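sysfs_attr_init() exists for lockdep: a dynamically allocated attribute has no static lockdep key, so each one must be initialized explicitly before device_create_file(). A hedged sketch of the pattern for a single attribute (foo_show and the names are illustrative):

static ssize_t foo_show(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	return sprintf(buf, "example\n");
}

static int create_foo_file(struct device *dev,
			   struct device_attribute *attr)
{
	attr->attr.name = "foo";
	attr->attr.mode = S_IRUGO;
	attr->show = foo_show;
	sysfs_attr_init(&attr->attr);	/* set up the lockdep key */
	return device_create_file(dev, attr);
}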
diff --git a/drivers/char/tty_buffer.c b/drivers/char/tty_buffer.c
index af8d97715728..7ee52164d474 100644
--- a/drivers/char/tty_buffer.c
+++ b/drivers/char/tty_buffer.c
@@ -248,7 +248,7 @@ int tty_insert_flip_string_fixed_flag(struct tty_struct *tty,
248{ 248{
249 int copied = 0; 249 int copied = 0;
250 do { 250 do {
251 int goal = min(size - copied, TTY_BUFFER_PAGE); 251 int goal = min_t(size_t, size - copied, TTY_BUFFER_PAGE);
252 int space = tty_buffer_request_room(tty, goal); 252 int space = tty_buffer_request_room(tty, goal);
253 struct tty_buffer *tb = tty->buf.tail; 253 struct tty_buffer *tb = tty->buf.tail;
254 /* If there is no space then tb may be NULL */ 254 /* If there is no space then tb may be NULL */
@@ -285,7 +285,7 @@ int tty_insert_flip_string_flags(struct tty_struct *tty,
285{ 285{
286 int copied = 0; 286 int copied = 0;
287 do { 287 do {
288 int goal = min(size - copied, TTY_BUFFER_PAGE); 288 int goal = min_t(size_t, size - copied, TTY_BUFFER_PAGE);
289 int space = tty_buffer_request_room(tty, goal); 289 int space = tty_buffer_request_room(tty, goal);
290 struct tty_buffer *tb = tty->buf.tail; 290 struct tty_buffer *tb = tty->buf.tail;
291 /* If there is no space then tb may be NULL */ 291 /* If there is no space then tb may be NULL */
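Both callers take a size_t size, so size - copied is unsigned while TTY_BUFFER_PAGE evaluates as an int; the kernel's min() deliberately warns on mixed types, hence min_t(size_t, ...) to cast both operands. Roughly, from include/linux/kernel.h:

/* The pointer comparison makes the compiler warn whenever x and y
 * have different types; min_t(type, x, y) casts both to type first. */
#define min(x, y) ({				\
	typeof(x) _min1 = (x);			\
	typeof(y) _min2 = (y);			\
	(void) (&_min1 == &_min2);		\
	_min1 < _min2 ? _min1 : _min2; })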
diff --git a/drivers/char/tty_port.c b/drivers/char/tty_port.c
index be492dd66437..a3bd1d0b66cf 100644
--- a/drivers/char/tty_port.c
+++ b/drivers/char/tty_port.c
@@ -119,7 +119,7 @@ EXPORT_SYMBOL(tty_port_tty_set);
119static void tty_port_shutdown(struct tty_port *port) 119static void tty_port_shutdown(struct tty_port *port)
120{ 120{
121 mutex_lock(&port->mutex); 121 mutex_lock(&port->mutex);
122 if (port->ops->shutdown && 122 if (port->ops->shutdown && !port->console &&
123 test_and_clear_bit(ASYNCB_INITIALIZED, &port->flags)) 123 test_and_clear_bit(ASYNCB_INITIALIZED, &port->flags))
124 port->ops->shutdown(port); 124 port->ops->shutdown(port);
125 mutex_unlock(&port->mutex); 125 mutex_unlock(&port->mutex);
diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
index f404ccfc9c20..44288ce0cb45 100644
--- a/drivers/char/virtio_console.c
+++ b/drivers/char/virtio_console.c
@@ -681,6 +681,10 @@ static void resize_console(struct port *port)
681 struct virtio_device *vdev; 681 struct virtio_device *vdev;
682 struct winsize ws; 682 struct winsize ws;
683 683
684 /* The port could have been hot-unplugged */
685 if (!port)
686 return;
687
684 vdev = port->portdev->vdev; 688 vdev = port->portdev->vdev;
685 if (virtio_has_feature(vdev, VIRTIO_CONSOLE_F_SIZE)) { 689 if (virtio_has_feature(vdev, VIRTIO_CONSOLE_F_SIZE)) {
686 vdev->config->get(vdev, 690 vdev->config->get(vdev,
@@ -947,11 +951,18 @@ static void handle_control_message(struct ports_device *portdev,
947 */ 951 */
948 err = sysfs_create_group(&port->dev->kobj, 952 err = sysfs_create_group(&port->dev->kobj,
949 &port_attribute_group); 953 &port_attribute_group);
950 if (err) 954 if (err) {
951 dev_err(port->dev, 955 dev_err(port->dev,
952 "Error %d creating sysfs device attributes\n", 956 "Error %d creating sysfs device attributes\n",
953 err); 957 err);
954 958 } else {
959 /*
960 * Generate a udev event so that appropriate
961 * symlinks can be created based on udev
962 * rules.
963 */
964 kobject_uevent(&port->dev->kobj, KOBJ_CHANGE);
965 }
955 break; 966 break;
956 case VIRTIO_CONSOLE_PORT_REMOVE: 967 case VIRTIO_CONSOLE_PORT_REMOVE:
957 /* 968 /*
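The ordering here matters: the sysfs group must exist before the KOBJ_CHANGE event fires, so that udev rules evaluated on the event can already read the port's attributes and create symlinks from them. The added pattern, reduced to its shape:

err = sysfs_create_group(&port->dev->kobj, &port_attribute_group);
if (!err)
	kobject_uevent(&port->dev->kobj, KOBJ_CHANGE);	/* re-run udev rules */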
diff --git a/drivers/char/vt_ioctl.c b/drivers/char/vt_ioctl.c
index 87778dcf8727..6aa10284104a 100644
--- a/drivers/char/vt_ioctl.c
+++ b/drivers/char/vt_ioctl.c
@@ -888,7 +888,7 @@ int vt_ioctl(struct tty_struct *tty, struct file * file,
888 ret = -EFAULT; 888 ret = -EFAULT;
889 goto out; 889 goto out;
890 } 890 }
891 if (tmp.mode != VT_AUTO && tmp.mode != VT_PROCESS && tmp.mode != VT_PROCESS_AUTO) { 891 if (tmp.mode != VT_AUTO && tmp.mode != VT_PROCESS) {
892 ret = -EINVAL; 892 ret = -EINVAL;
893 goto out; 893 goto out;
894 } 894 }
@@ -1622,7 +1622,7 @@ static void complete_change_console(struct vc_data *vc)
1622 * telling it that it has acquired. Also check if it has died and 1622 * telling it that it has acquired. Also check if it has died and
1623 * clean up (similar to logic employed in change_console()) 1623 * clean up (similar to logic employed in change_console())
1624 */ 1624 */
1625 if (vc->vt_mode.mode == VT_PROCESS || vc->vt_mode.mode == VT_PROCESS_AUTO) { 1625 if (vc->vt_mode.mode == VT_PROCESS) {
1626 /* 1626 /*
1627 * Send the signal as privileged - kill_pid() will 1627 * Send the signal as privileged - kill_pid() will
1628 * tell us if the process has gone or something else 1628 * tell us if the process has gone or something else
@@ -1682,7 +1682,7 @@ void change_console(struct vc_data *new_vc)
1682 * vt to auto control. 1682 * vt to auto control.
1683 */ 1683 */
1684 vc = vc_cons[fg_console].d; 1684 vc = vc_cons[fg_console].d;
1685 if (vc->vt_mode.mode == VT_PROCESS || vc->vt_mode.mode == VT_PROCESS_AUTO) { 1685 if (vc->vt_mode.mode == VT_PROCESS) {
1686 /* 1686 /*
1687 * Send the signal as privileged - kill_pid() will 1687 * Send the signal as privileged - kill_pid() will
1688 * tell us if the process has gone or something else 1688 * tell us if the process has gone or something else
@@ -1693,28 +1693,27 @@ void change_console(struct vc_data *new_vc)
1693 */ 1693 */
1694 vc->vt_newvt = new_vc->vc_num; 1694 vc->vt_newvt = new_vc->vc_num;
1695 if (kill_pid(vc->vt_pid, vc->vt_mode.relsig, 1) == 0) { 1695 if (kill_pid(vc->vt_pid, vc->vt_mode.relsig, 1) == 0) {
1696 if(vc->vt_mode.mode == VT_PROCESS)
1697 /*
1698 * It worked. Mark the vt to switch to and
1699 * return. The process needs to send us a
1700 * VT_RELDISP ioctl to complete the switch.
1701 */
1702 return;
1703 } else {
1704 /* 1696 /*
1705 * The controlling process has died, so we revert back to 1697 * It worked. Mark the vt to switch to and
1706 * normal operation. In this case, we'll also change back 1698 * return. The process needs to send us a
1707 * to KD_TEXT mode. I'm not sure if this is strictly correct 1699 * VT_RELDISP ioctl to complete the switch.
1708 * but it saves the agony when the X server dies and the screen
1709 * remains blanked due to KD_GRAPHICS! It would be nice to do
1710 * this outside of VT_PROCESS but there is no single process
1711 * to account for and tracking tty count may be undesirable.
1712 */ 1700 */
1713 reset_vc(vc); 1701 return;
1714 } 1702 }
1715 1703
1716 /* 1704 /*
1717 * Fall through to normal (VT_AUTO and VT_PROCESS_AUTO) handling of the switch... 1705 * The controlling process has died, so we revert back to
1706 * normal operation. In this case, we'll also change back
1707 * to KD_TEXT mode. I'm not sure if this is strictly correct
1708 * but it saves the agony when the X server dies and the screen
1709 * remains blanked due to KD_GRAPHICS! It would be nice to do
1710 * this outside of VT_PROCESS but there is no single process
1711 * to account for and tracking tty count may be undesirable.
1712 */
1713 reset_vc(vc);
1714
1715 /*
1716 * Fall through to normal (VT_AUTO) handling of the switch...
1718 */ 1717 */
1719 } 1718 }
1720 1719
diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c
index 578595c4425d..c5f66171a713 100644
--- a/drivers/clocksource/sh_cmt.c
+++ b/drivers/clocksource/sh_cmt.c
@@ -149,13 +149,12 @@ static void sh_cmt_start_stop_ch(struct sh_cmt_priv *p, int start)
149 149
150static int sh_cmt_enable(struct sh_cmt_priv *p, unsigned long *rate) 150static int sh_cmt_enable(struct sh_cmt_priv *p, unsigned long *rate)
151{ 151{
152 struct sh_timer_config *cfg = p->pdev->dev.platform_data;
153 int ret; 152 int ret;
154 153
155 /* enable clock */ 154 /* enable clock */
156 ret = clk_enable(p->clk); 155 ret = clk_enable(p->clk);
157 if (ret) { 156 if (ret) {
158 pr_err("sh_cmt: cannot enable clock \"%s\"\n", cfg->clk); 157 dev_err(&p->pdev->dev, "cannot enable clock\n");
159 return ret; 158 return ret;
160 } 159 }
161 160
@@ -278,7 +277,7 @@ static void sh_cmt_clock_event_program_verify(struct sh_cmt_priv *p,
278 delay = 1; 277 delay = 1;
279 278
280 if (!delay) 279 if (!delay)
281 pr_warning("sh_cmt: too long delay\n"); 280 dev_warn(&p->pdev->dev, "too long delay\n");
282 281
283 } while (delay); 282 } while (delay);
284} 283}
@@ -288,7 +287,7 @@ static void sh_cmt_set_next(struct sh_cmt_priv *p, unsigned long delta)
288 unsigned long flags; 287 unsigned long flags;
289 288
290 if (delta > p->max_match_value) 289 if (delta > p->max_match_value)
291 pr_warning("sh_cmt: delta out of range\n"); 290 dev_warn(&p->pdev->dev, "delta out of range\n");
292 291
293 spin_lock_irqsave(&p->lock, flags); 292 spin_lock_irqsave(&p->lock, flags);
294 p->next_match_value = delta; 293 p->next_match_value = delta;
@@ -450,7 +449,7 @@ static int sh_cmt_register_clocksource(struct sh_cmt_priv *p,
450 cs->resume = sh_cmt_clocksource_resume; 449 cs->resume = sh_cmt_clocksource_resume;
451 cs->mask = CLOCKSOURCE_MASK(sizeof(unsigned long) * 8); 450 cs->mask = CLOCKSOURCE_MASK(sizeof(unsigned long) * 8);
452 cs->flags = CLOCK_SOURCE_IS_CONTINUOUS; 451 cs->flags = CLOCK_SOURCE_IS_CONTINUOUS;
453 pr_info("sh_cmt: %s used as clock source\n", cs->name); 452 dev_info(&p->pdev->dev, "used as clock source\n");
454 clocksource_register(cs); 453 clocksource_register(cs);
455 return 0; 454 return 0;
456} 455}
@@ -496,13 +495,11 @@ static void sh_cmt_clock_event_mode(enum clock_event_mode mode,
496 495
497 switch (mode) { 496 switch (mode) {
498 case CLOCK_EVT_MODE_PERIODIC: 497 case CLOCK_EVT_MODE_PERIODIC:
499 pr_info("sh_cmt: %s used for periodic clock events\n", 498 dev_info(&p->pdev->dev, "used for periodic clock events\n");
500 ced->name);
501 sh_cmt_clock_event_start(p, 1); 499 sh_cmt_clock_event_start(p, 1);
502 break; 500 break;
503 case CLOCK_EVT_MODE_ONESHOT: 501 case CLOCK_EVT_MODE_ONESHOT:
504 pr_info("sh_cmt: %s used for oneshot clock events\n", 502 dev_info(&p->pdev->dev, "used for oneshot clock events\n");
505 ced->name);
506 sh_cmt_clock_event_start(p, 0); 503 sh_cmt_clock_event_start(p, 0);
507 break; 504 break;
508 case CLOCK_EVT_MODE_SHUTDOWN: 505 case CLOCK_EVT_MODE_SHUTDOWN:
@@ -543,7 +540,7 @@ static void sh_cmt_register_clockevent(struct sh_cmt_priv *p,
543 ced->set_next_event = sh_cmt_clock_event_next; 540 ced->set_next_event = sh_cmt_clock_event_next;
544 ced->set_mode = sh_cmt_clock_event_mode; 541 ced->set_mode = sh_cmt_clock_event_mode;
545 542
546 pr_info("sh_cmt: %s used for clock events\n", ced->name); 543 dev_info(&p->pdev->dev, "used for clock events\n");
547 clockevents_register_device(ced); 544 clockevents_register_device(ced);
548} 545}
549 546
@@ -600,22 +597,26 @@ static int sh_cmt_setup(struct sh_cmt_priv *p, struct platform_device *pdev)
600 /* map memory, let mapbase point to our channel */ 597 /* map memory, let mapbase point to our channel */
601 p->mapbase = ioremap_nocache(res->start, resource_size(res)); 598 p->mapbase = ioremap_nocache(res->start, resource_size(res));
602 if (p->mapbase == NULL) { 599 if (p->mapbase == NULL) {
603 pr_err("sh_cmt: failed to remap I/O memory\n"); 600 dev_err(&p->pdev->dev, "failed to remap I/O memory\n");
604 goto err0; 601 goto err0;
605 } 602 }
606 603
607 /* request irq using setup_irq() (too early for request_irq()) */ 604 /* request irq using setup_irq() (too early for request_irq()) */
608 p->irqaction.name = cfg->name; 605 p->irqaction.name = dev_name(&p->pdev->dev);
609 p->irqaction.handler = sh_cmt_interrupt; 606 p->irqaction.handler = sh_cmt_interrupt;
610 p->irqaction.dev_id = p; 607 p->irqaction.dev_id = p;
611 p->irqaction.flags = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL; 608 p->irqaction.flags = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL;
612 609
613 /* get hold of clock */ 610 /* get hold of clock */
614 p->clk = clk_get(&p->pdev->dev, cfg->clk); 611 p->clk = clk_get(&p->pdev->dev, "cmt_fck");
615 if (IS_ERR(p->clk)) { 612 if (IS_ERR(p->clk)) {
616 pr_err("sh_cmt: cannot get clock \"%s\"\n", cfg->clk); 613 dev_warn(&p->pdev->dev, "using deprecated clock lookup\n");
617 ret = PTR_ERR(p->clk); 614 p->clk = clk_get(&p->pdev->dev, cfg->clk);
618 goto err1; 615 if (IS_ERR(p->clk)) {
616 dev_err(&p->pdev->dev, "cannot get clock\n");
617 ret = PTR_ERR(p->clk);
618 goto err1;
619 }
619 } 620 }
620 621
621 if (resource_size(res) == 6) { 622 if (resource_size(res) == 6) {
@@ -628,17 +629,17 @@ static int sh_cmt_setup(struct sh_cmt_priv *p, struct platform_device *pdev)
628 p->clear_bits = ~0xc000; 629 p->clear_bits = ~0xc000;
629 } 630 }
630 631
631 ret = sh_cmt_register(p, cfg->name, 632 ret = sh_cmt_register(p, (char *)dev_name(&p->pdev->dev),
632 cfg->clockevent_rating, 633 cfg->clockevent_rating,
633 cfg->clocksource_rating); 634 cfg->clocksource_rating);
634 if (ret) { 635 if (ret) {
635 pr_err("sh_cmt: registration failed\n"); 636 dev_err(&p->pdev->dev, "registration failed\n");
636 goto err1; 637 goto err1;
637 } 638 }
638 639
639 ret = setup_irq(irq, &p->irqaction); 640 ret = setup_irq(irq, &p->irqaction);
640 if (ret) { 641 if (ret) {
641 pr_err("sh_cmt: failed to request irq %d\n", irq); 642 dev_err(&p->pdev->dev, "failed to request irq %d\n", irq);
642 goto err1; 643 goto err1;
643 } 644 }
644 645
@@ -653,11 +654,10 @@ err0:
653static int __devinit sh_cmt_probe(struct platform_device *pdev) 654static int __devinit sh_cmt_probe(struct platform_device *pdev)
654{ 655{
655 struct sh_cmt_priv *p = platform_get_drvdata(pdev); 656 struct sh_cmt_priv *p = platform_get_drvdata(pdev);
656 struct sh_timer_config *cfg = pdev->dev.platform_data;
657 int ret; 657 int ret;
658 658
659 if (p) { 659 if (p) {
660 pr_info("sh_cmt: %s kept as earlytimer\n", cfg->name); 660 dev_info(&pdev->dev, "kept as earlytimer\n");
661 return 0; 661 return 0;
662 } 662 }
663 663
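The new clk_get(&p->pdev->dev, "cmt_fck") resolves through clkdev (arch/sh/kernel/clkdev.c in this same series), with the platform-data cfg->clk name kept only as a deprecated fallback. A hypothetical lookup registration that would satisfy the new call; the device id, connection id and cmt0_clk are illustrative:

static struct clk_lookup cmt0_clk_lookup = {
	.dev_id	= "sh_cmt.0",	/* dev_name() of the platform device */
	.con_id	= "cmt_fck",	/* connection id passed to clk_get() */
	.clk	= &cmt0_clk,	/* hypothetical struct clk */
};

void __init board_register_clocks(void)
{
	clkdev_add(&cmt0_clk_lookup);
}

The sh_mtu2 and sh_tmu hunks below apply the same conversion with "mtu2_fck" and "tmu_fck".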
diff --git a/drivers/clocksource/sh_mtu2.c b/drivers/clocksource/sh_mtu2.c
index 4c8a759e60cd..b11882e0f1bd 100644
--- a/drivers/clocksource/sh_mtu2.c
+++ b/drivers/clocksource/sh_mtu2.c
@@ -118,13 +118,12 @@ static void sh_mtu2_start_stop_ch(struct sh_mtu2_priv *p, int start)
118 118
119static int sh_mtu2_enable(struct sh_mtu2_priv *p) 119static int sh_mtu2_enable(struct sh_mtu2_priv *p)
120{ 120{
121 struct sh_timer_config *cfg = p->pdev->dev.platform_data;
122 int ret; 121 int ret;
123 122
124 /* enable clock */ 123 /* enable clock */
125 ret = clk_enable(p->clk); 124 ret = clk_enable(p->clk);
126 if (ret) { 125 if (ret) {
127 pr_err("sh_mtu2: cannot enable clock \"%s\"\n", cfg->clk); 126 dev_err(&p->pdev->dev, "cannot enable clock\n");
128 return ret; 127 return ret;
129 } 128 }
130 129
@@ -193,8 +192,7 @@ static void sh_mtu2_clock_event_mode(enum clock_event_mode mode,
193 192
194 switch (mode) { 193 switch (mode) {
195 case CLOCK_EVT_MODE_PERIODIC: 194 case CLOCK_EVT_MODE_PERIODIC:
196 pr_info("sh_mtu2: %s used for periodic clock events\n", 195 dev_info(&p->pdev->dev, "used for periodic clock events\n");
197 ced->name);
198 sh_mtu2_enable(p); 196 sh_mtu2_enable(p);
199 break; 197 break;
200 case CLOCK_EVT_MODE_UNUSED: 198 case CLOCK_EVT_MODE_UNUSED:
@@ -221,13 +219,13 @@ static void sh_mtu2_register_clockevent(struct sh_mtu2_priv *p,
221 ced->cpumask = cpumask_of(0); 219 ced->cpumask = cpumask_of(0);
222 ced->set_mode = sh_mtu2_clock_event_mode; 220 ced->set_mode = sh_mtu2_clock_event_mode;
223 221
224 pr_info("sh_mtu2: %s used for clock events\n", ced->name); 222 dev_info(&p->pdev->dev, "used for clock events\n");
225 clockevents_register_device(ced); 223 clockevents_register_device(ced);
226 224
227 ret = setup_irq(p->irqaction.irq, &p->irqaction); 225 ret = setup_irq(p->irqaction.irq, &p->irqaction);
228 if (ret) { 226 if (ret) {
229 pr_err("sh_mtu2: failed to request irq %d\n", 227 dev_err(&p->pdev->dev, "failed to request irq %d\n",
230 p->irqaction.irq); 228 p->irqaction.irq);
231 return; 229 return;
232 } 230 }
233} 231}
@@ -273,26 +271,31 @@ static int sh_mtu2_setup(struct sh_mtu2_priv *p, struct platform_device *pdev)
273 /* map memory, let mapbase point to our channel */ 271 /* map memory, let mapbase point to our channel */
274 p->mapbase = ioremap_nocache(res->start, resource_size(res)); 272 p->mapbase = ioremap_nocache(res->start, resource_size(res));
275 if (p->mapbase == NULL) { 273 if (p->mapbase == NULL) {
276 pr_err("sh_mtu2: failed to remap I/O memory\n"); 274 dev_err(&p->pdev->dev, "failed to remap I/O memory\n");
277 goto err0; 275 goto err0;
278 } 276 }
279 277
280 /* setup data for setup_irq() (too early for request_irq()) */ 278 /* setup data for setup_irq() (too early for request_irq()) */
281 p->irqaction.name = cfg->name; 279 p->irqaction.name = dev_name(&p->pdev->dev);
282 p->irqaction.handler = sh_mtu2_interrupt; 280 p->irqaction.handler = sh_mtu2_interrupt;
283 p->irqaction.dev_id = p; 281 p->irqaction.dev_id = p;
284 p->irqaction.irq = irq; 282 p->irqaction.irq = irq;
285 p->irqaction.flags = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL; 283 p->irqaction.flags = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL;
286 284
287 /* get hold of clock */ 285 /* get hold of clock */
288 p->clk = clk_get(&p->pdev->dev, cfg->clk); 286 p->clk = clk_get(&p->pdev->dev, "mtu2_fck");
289 if (IS_ERR(p->clk)) { 287 if (IS_ERR(p->clk)) {
290 pr_err("sh_mtu2: cannot get clock \"%s\"\n", cfg->clk); 288 dev_warn(&p->pdev->dev, "using deprecated clock lookup\n");
291 ret = PTR_ERR(p->clk); 289 p->clk = clk_get(&p->pdev->dev, cfg->clk);
292 goto err1; 290 if (IS_ERR(p->clk)) {
291 dev_err(&p->pdev->dev, "cannot get clock\n");
292 ret = PTR_ERR(p->clk);
293 goto err1;
294 }
293 } 295 }
294 296
295 return sh_mtu2_register(p, cfg->name, cfg->clockevent_rating); 297 return sh_mtu2_register(p, (char *)dev_name(&p->pdev->dev),
298 cfg->clockevent_rating);
296 err1: 299 err1:
297 iounmap(p->mapbase); 300 iounmap(p->mapbase);
298 err0: 301 err0:
@@ -302,11 +305,10 @@ static int sh_mtu2_setup(struct sh_mtu2_priv *p, struct platform_device *pdev)
302static int __devinit sh_mtu2_probe(struct platform_device *pdev) 305static int __devinit sh_mtu2_probe(struct platform_device *pdev)
303{ 306{
304 struct sh_mtu2_priv *p = platform_get_drvdata(pdev); 307 struct sh_mtu2_priv *p = platform_get_drvdata(pdev);
305 struct sh_timer_config *cfg = pdev->dev.platform_data;
306 int ret; 308 int ret;
307 309
308 if (p) { 310 if (p) {
309 pr_info("sh_mtu2: %s kept as earlytimer\n", cfg->name); 311 dev_info(&pdev->dev, "kept as earlytimer\n");
310 return 0; 312 return 0;
311 } 313 }
312 314
diff --git a/drivers/clocksource/sh_tmu.c b/drivers/clocksource/sh_tmu.c
index 961f5b5ef6a3..6b62283c1aba 100644
--- a/drivers/clocksource/sh_tmu.c
+++ b/drivers/clocksource/sh_tmu.c
@@ -106,13 +106,12 @@ static void sh_tmu_start_stop_ch(struct sh_tmu_priv *p, int start)
106 106
107static int sh_tmu_enable(struct sh_tmu_priv *p) 107static int sh_tmu_enable(struct sh_tmu_priv *p)
108{ 108{
109 struct sh_timer_config *cfg = p->pdev->dev.platform_data;
110 int ret; 109 int ret;
111 110
112 /* enable clock */ 111 /* enable clock */
113 ret = clk_enable(p->clk); 112 ret = clk_enable(p->clk);
114 if (ret) { 113 if (ret) {
115 pr_err("sh_tmu: cannot enable clock \"%s\"\n", cfg->clk); 114 dev_err(&p->pdev->dev, "cannot enable clock\n");
116 return ret; 115 return ret;
117 } 116 }
118 117
@@ -228,7 +227,7 @@ static int sh_tmu_register_clocksource(struct sh_tmu_priv *p,
228 cs->disable = sh_tmu_clocksource_disable; 227 cs->disable = sh_tmu_clocksource_disable;
229 cs->mask = CLOCKSOURCE_MASK(32); 228 cs->mask = CLOCKSOURCE_MASK(32);
230 cs->flags = CLOCK_SOURCE_IS_CONTINUOUS; 229 cs->flags = CLOCK_SOURCE_IS_CONTINUOUS;
231 pr_info("sh_tmu: %s used as clock source\n", cs->name); 230 dev_info(&p->pdev->dev, "used as clock source\n");
232 clocksource_register(cs); 231 clocksource_register(cs);
233 return 0; 232 return 0;
234} 233}
@@ -276,13 +275,11 @@ static void sh_tmu_clock_event_mode(enum clock_event_mode mode,
276 275
277 switch (mode) { 276 switch (mode) {
278 case CLOCK_EVT_MODE_PERIODIC: 277 case CLOCK_EVT_MODE_PERIODIC:
279 pr_info("sh_tmu: %s used for periodic clock events\n", 278 dev_info(&p->pdev->dev, "used for periodic clock events\n");
280 ced->name);
281 sh_tmu_clock_event_start(p, 1); 279 sh_tmu_clock_event_start(p, 1);
282 break; 280 break;
283 case CLOCK_EVT_MODE_ONESHOT: 281 case CLOCK_EVT_MODE_ONESHOT:
284 pr_info("sh_tmu: %s used for oneshot clock events\n", 282 dev_info(&p->pdev->dev, "used for oneshot clock events\n");
285 ced->name);
286 sh_tmu_clock_event_start(p, 0); 283 sh_tmu_clock_event_start(p, 0);
287 break; 284 break;
288 case CLOCK_EVT_MODE_UNUSED: 285 case CLOCK_EVT_MODE_UNUSED:
@@ -323,13 +320,13 @@ static void sh_tmu_register_clockevent(struct sh_tmu_priv *p,
323 ced->set_next_event = sh_tmu_clock_event_next; 320 ced->set_next_event = sh_tmu_clock_event_next;
324 ced->set_mode = sh_tmu_clock_event_mode; 321 ced->set_mode = sh_tmu_clock_event_mode;
325 322
326 pr_info("sh_tmu: %s used for clock events\n", ced->name); 323 dev_info(&p->pdev->dev, "used for clock events\n");
327 clockevents_register_device(ced); 324 clockevents_register_device(ced);
328 325
329 ret = setup_irq(p->irqaction.irq, &p->irqaction); 326 ret = setup_irq(p->irqaction.irq, &p->irqaction);
330 if (ret) { 327 if (ret) {
331 pr_err("sh_tmu: failed to request irq %d\n", 328 dev_err(&p->pdev->dev, "failed to request irq %d\n",
332 p->irqaction.irq); 329 p->irqaction.irq);
333 return; 330 return;
334 } 331 }
335} 332}
@@ -378,26 +375,30 @@ static int sh_tmu_setup(struct sh_tmu_priv *p, struct platform_device *pdev)
378 /* map memory, let mapbase point to our channel */ 375 /* map memory, let mapbase point to our channel */
379 p->mapbase = ioremap_nocache(res->start, resource_size(res)); 376 p->mapbase = ioremap_nocache(res->start, resource_size(res));
380 if (p->mapbase == NULL) { 377 if (p->mapbase == NULL) {
381 pr_err("sh_tmu: failed to remap I/O memory\n"); 378 dev_err(&p->pdev->dev, "failed to remap I/O memory\n");
382 goto err0; 379 goto err0;
383 } 380 }
384 381
385 /* setup data for setup_irq() (too early for request_irq()) */ 382 /* setup data for setup_irq() (too early for request_irq()) */
386 p->irqaction.name = cfg->name; 383 p->irqaction.name = dev_name(&p->pdev->dev);
387 p->irqaction.handler = sh_tmu_interrupt; 384 p->irqaction.handler = sh_tmu_interrupt;
388 p->irqaction.dev_id = p; 385 p->irqaction.dev_id = p;
389 p->irqaction.irq = irq; 386 p->irqaction.irq = irq;
390 p->irqaction.flags = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL; 387 p->irqaction.flags = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL;
391 388
392 /* get hold of clock */ 389 /* get hold of clock */
393 p->clk = clk_get(&p->pdev->dev, cfg->clk); 390 p->clk = clk_get(&p->pdev->dev, "tmu_fck");
394 if (IS_ERR(p->clk)) { 391 if (IS_ERR(p->clk)) {
395 pr_err("sh_tmu: cannot get clock \"%s\"\n", cfg->clk); 392 dev_warn(&p->pdev->dev, "using deprecated clock lookup\n");
396 ret = PTR_ERR(p->clk); 393 p->clk = clk_get(&p->pdev->dev, cfg->clk);
397 goto err1; 394 if (IS_ERR(p->clk)) {
395 dev_err(&p->pdev->dev, "cannot get clock\n");
396 ret = PTR_ERR(p->clk);
397 goto err1;
398 }
398 } 399 }
399 400
400 return sh_tmu_register(p, cfg->name, 401 return sh_tmu_register(p, (char *)dev_name(&p->pdev->dev),
401 cfg->clockevent_rating, 402 cfg->clockevent_rating,
402 cfg->clocksource_rating); 403 cfg->clocksource_rating);
403 err1: 404 err1:
@@ -409,11 +410,10 @@ static int sh_tmu_setup(struct sh_tmu_priv *p, struct platform_device *pdev)
409static int __devinit sh_tmu_probe(struct platform_device *pdev) 410static int __devinit sh_tmu_probe(struct platform_device *pdev)
410{ 411{
411 struct sh_tmu_priv *p = platform_get_drvdata(pdev); 412 struct sh_tmu_priv *p = platform_get_drvdata(pdev);
412 struct sh_timer_config *cfg = pdev->dev.platform_data;
413 int ret; 413 int ret;
414 414
415 if (p) { 415 if (p) {
416 pr_info("sh_tmu: %s kept as earlytimer\n", cfg->name); 416 dev_info(&pdev->dev, "kept as earlytimer\n");
417 return 0; 417 return 0;
418 } 418 }
419 419
diff --git a/drivers/dma/shdma.c b/drivers/dma/shdma.c
index 5d17e09cb625..7a18b580f626 100644
--- a/drivers/dma/shdma.c
+++ b/drivers/dma/shdma.c
@@ -25,8 +25,7 @@
25#include <linux/dma-mapping.h> 25#include <linux/dma-mapping.h>
26#include <linux/platform_device.h> 26#include <linux/platform_device.h>
27#include <linux/pm_runtime.h> 27#include <linux/pm_runtime.h>
28 28#include <linux/sh_dma.h>
29#include <asm/dmaengine.h>
30 29
31#include "shdma.h" 30#include "shdma.h"
32 31
@@ -44,7 +43,7 @@ enum sh_dmae_desc_status {
44#define LOG2_DEFAULT_XFER_SIZE 2 43#define LOG2_DEFAULT_XFER_SIZE 2
45 44
46/* A bitmask with bits enough for enum sh_dmae_slave_chan_id */ 45/* A bitmask with bits enough for enum sh_dmae_slave_chan_id */
47static unsigned long sh_dmae_slave_used[BITS_TO_LONGS(SHDMA_SLAVE_NUMBER)]; 46static unsigned long sh_dmae_slave_used[BITS_TO_LONGS(SH_DMA_SLAVE_NUMBER)];
48 47
49static void sh_dmae_chan_ld_cleanup(struct sh_dmae_chan *sh_chan, bool all); 48static void sh_dmae_chan_ld_cleanup(struct sh_dmae_chan *sh_chan, bool all);
50 49
@@ -266,7 +265,7 @@ static struct sh_desc *sh_dmae_get_desc(struct sh_dmae_chan *sh_chan)
266} 265}
267 266
268static struct sh_dmae_slave_config *sh_dmae_find_slave( 267static struct sh_dmae_slave_config *sh_dmae_find_slave(
269 struct sh_dmae_chan *sh_chan, enum sh_dmae_slave_chan_id slave_id) 268 struct sh_dmae_chan *sh_chan, struct sh_dmae_slave *param)
270{ 269{
271 struct dma_device *dma_dev = sh_chan->common.device; 270 struct dma_device *dma_dev = sh_chan->common.device;
272 struct sh_dmae_device *shdev = container_of(dma_dev, 271 struct sh_dmae_device *shdev = container_of(dma_dev,
@@ -274,11 +273,11 @@ static struct sh_dmae_slave_config *sh_dmae_find_slave(
274 struct sh_dmae_pdata *pdata = shdev->pdata; 273 struct sh_dmae_pdata *pdata = shdev->pdata;
275 int i; 274 int i;
276 275
277 if ((unsigned)slave_id >= SHDMA_SLAVE_NUMBER) 276 if (param->slave_id >= SH_DMA_SLAVE_NUMBER)
278 return NULL; 277 return NULL;
279 278
280 for (i = 0; i < pdata->slave_num; i++) 279 for (i = 0; i < pdata->slave_num; i++)
281 if (pdata->slave[i].slave_id == slave_id) 280 if (pdata->slave[i].slave_id == param->slave_id)
282 return pdata->slave + i; 281 return pdata->slave + i;
283 282
284 return NULL; 283 return NULL;
@@ -299,7 +298,7 @@ static int sh_dmae_alloc_chan_resources(struct dma_chan *chan)
299 if (param) { 298 if (param) {
300 struct sh_dmae_slave_config *cfg; 299 struct sh_dmae_slave_config *cfg;
301 300
302 cfg = sh_dmae_find_slave(sh_chan, param->slave_id); 301 cfg = sh_dmae_find_slave(sh_chan, param);
303 if (!cfg) 302 if (!cfg)
304 return -EINVAL; 303 return -EINVAL;
305 304
diff --git a/drivers/dma/shdma.h b/drivers/dma/shdma.h
index 153609a1e96c..4021275a0a43 100644
--- a/drivers/dma/shdma.h
+++ b/drivers/dma/shdma.h
@@ -17,8 +17,8 @@
17#include <linux/interrupt.h> 17#include <linux/interrupt.h>
18#include <linux/list.h> 18#include <linux/list.h>
19 19
20#include <asm/dmaengine.h> 20#define SH_DMAC_MAX_CHANNELS 6
21 21#define SH_DMA_SLAVE_NUMBER 256
22#define SH_DMA_TCR_MAX 0x00FFFFFF /* 16MB */ 22#define SH_DMA_TCR_MAX 0x00FFFFFF /* 16MB */
23 23
24struct device; 24struct device;
diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c
index 8fc91a019620..f5b6d9fe4def 100644
--- a/drivers/edac/edac_mce_amd.c
+++ b/drivers/edac/edac_mce_amd.c
@@ -316,7 +316,12 @@ void amd_decode_nb_mce(int node_id, struct err_regs *regs, int handle_errors)
316 if (regs->nbsh & K8_NBSH_ERR_CPU_VAL) 316 if (regs->nbsh & K8_NBSH_ERR_CPU_VAL)
317 pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf)); 317 pr_cont(", core: %u\n", (u8)(regs->nbsh & 0xf));
318 } else { 318 } else {
319 pr_cont(", core: %d\n", fls((regs->nbsh & 0xf) - 1)); 319 u8 assoc_cpus = regs->nbsh & 0xf;
320
321 if (assoc_cpus > 0)
322 pr_cont(", core: %d", fls(assoc_cpus) - 1);
323
324 pr_cont("\n");
320 } 325 }
321 326
322 pr_emerg("%s.\n", EXT_ERR_MSG(xec)); 327 pr_emerg("%s.\n", EXT_ERR_MSG(xec));
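The old expression misbehaved for an empty mask: (0 & 0xf) - 1 is -1, and fls(-1) is 32, so a bogus core number was printed. A worked example of the fixed decode (assoc_cpus as above):

u8 assoc_cpus = 0x4;	/* only bit 2 set */
if (assoc_cpus > 0)
	pr_cont(", core: %d", fls(assoc_cpus) - 1);	/* fls(0x4) = 3, core 2 */
pr_cont("\n");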
diff --git a/drivers/gpio/max730x.c b/drivers/gpio/max730x.c
index c9bced55f82b..4a7d662ff9b7 100644
--- a/drivers/gpio/max730x.c
+++ b/drivers/gpio/max730x.c
@@ -242,3 +242,7 @@ int __devexit __max730x_remove(struct device *dev)
242 return ret; 242 return ret;
243} 243}
244EXPORT_SYMBOL_GPL(__max730x_remove); 244EXPORT_SYMBOL_GPL(__max730x_remove);
245
246MODULE_AUTHOR("Juergen Beisert, Wolfram Sang");
247MODULE_LICENSE("GPL v2");
248MODULE_DESCRIPTION("MAX730x GPIO-Expanders, generic parts");
diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 8bfc0bbf13e6..a9f8589490cf 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1881,29 +1881,29 @@ struct drm_ioctl_desc i915_ioctls[] = {
1881 DRM_IOCTL_DEF(DRM_I915_GET_VBLANK_PIPE, i915_vblank_pipe_get, DRM_AUTH ), 1881 DRM_IOCTL_DEF(DRM_I915_GET_VBLANK_PIPE, i915_vblank_pipe_get, DRM_AUTH ),
1882 DRM_IOCTL_DEF(DRM_I915_VBLANK_SWAP, i915_vblank_swap, DRM_AUTH), 1882 DRM_IOCTL_DEF(DRM_I915_VBLANK_SWAP, i915_vblank_swap, DRM_AUTH),
1883 DRM_IOCTL_DEF(DRM_I915_HWS_ADDR, i915_set_status_page, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY), 1883 DRM_IOCTL_DEF(DRM_I915_HWS_ADDR, i915_set_status_page, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
1884 DRM_IOCTL_DEF(DRM_I915_GEM_INIT, i915_gem_init_ioctl, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY), 1884 DRM_IOCTL_DEF(DRM_I915_GEM_INIT, i915_gem_init_ioctl, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY|DRM_UNLOCKED),
1885 DRM_IOCTL_DEF(DRM_I915_GEM_EXECBUFFER, i915_gem_execbuffer, DRM_AUTH), 1885 DRM_IOCTL_DEF(DRM_I915_GEM_EXECBUFFER, i915_gem_execbuffer, DRM_AUTH|DRM_UNLOCKED),
1886 DRM_IOCTL_DEF(DRM_I915_GEM_EXECBUFFER2, i915_gem_execbuffer2, DRM_AUTH), 1886 DRM_IOCTL_DEF(DRM_I915_GEM_EXECBUFFER2, i915_gem_execbuffer2, DRM_AUTH|DRM_UNLOCKED),
1887 DRM_IOCTL_DEF(DRM_I915_GEM_PIN, i915_gem_pin_ioctl, DRM_AUTH|DRM_ROOT_ONLY), 1887 DRM_IOCTL_DEF(DRM_I915_GEM_PIN, i915_gem_pin_ioctl, DRM_AUTH|DRM_ROOT_ONLY|DRM_UNLOCKED),
1888 DRM_IOCTL_DEF(DRM_I915_GEM_UNPIN, i915_gem_unpin_ioctl, DRM_AUTH|DRM_ROOT_ONLY), 1888 DRM_IOCTL_DEF(DRM_I915_GEM_UNPIN, i915_gem_unpin_ioctl, DRM_AUTH|DRM_ROOT_ONLY|DRM_UNLOCKED),
1889 DRM_IOCTL_DEF(DRM_I915_GEM_BUSY, i915_gem_busy_ioctl, DRM_AUTH), 1889 DRM_IOCTL_DEF(DRM_I915_GEM_BUSY, i915_gem_busy_ioctl, DRM_AUTH|DRM_UNLOCKED),
1890 DRM_IOCTL_DEF(DRM_I915_GEM_THROTTLE, i915_gem_throttle_ioctl, DRM_AUTH), 1890 DRM_IOCTL_DEF(DRM_I915_GEM_THROTTLE, i915_gem_throttle_ioctl, DRM_AUTH|DRM_UNLOCKED),
1891 DRM_IOCTL_DEF(DRM_I915_GEM_ENTERVT, i915_gem_entervt_ioctl, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY), 1891 DRM_IOCTL_DEF(DRM_I915_GEM_ENTERVT, i915_gem_entervt_ioctl, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY|DRM_UNLOCKED),
1892 DRM_IOCTL_DEF(DRM_I915_GEM_LEAVEVT, i915_gem_leavevt_ioctl, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY), 1892 DRM_IOCTL_DEF(DRM_I915_GEM_LEAVEVT, i915_gem_leavevt_ioctl, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY|DRM_UNLOCKED),
1893 DRM_IOCTL_DEF(DRM_I915_GEM_CREATE, i915_gem_create_ioctl, 0), 1893 DRM_IOCTL_DEF(DRM_I915_GEM_CREATE, i915_gem_create_ioctl, DRM_UNLOCKED),
1894 DRM_IOCTL_DEF(DRM_I915_GEM_PREAD, i915_gem_pread_ioctl, 0), 1894 DRM_IOCTL_DEF(DRM_I915_GEM_PREAD, i915_gem_pread_ioctl, DRM_UNLOCKED),
1895 DRM_IOCTL_DEF(DRM_I915_GEM_PWRITE, i915_gem_pwrite_ioctl, 0), 1895 DRM_IOCTL_DEF(DRM_I915_GEM_PWRITE, i915_gem_pwrite_ioctl, DRM_UNLOCKED),
1896 DRM_IOCTL_DEF(DRM_I915_GEM_MMAP, i915_gem_mmap_ioctl, 0), 1896 DRM_IOCTL_DEF(DRM_I915_GEM_MMAP, i915_gem_mmap_ioctl, DRM_UNLOCKED),
1897 DRM_IOCTL_DEF(DRM_I915_GEM_MMAP_GTT, i915_gem_mmap_gtt_ioctl, 0), 1897 DRM_IOCTL_DEF(DRM_I915_GEM_MMAP_GTT, i915_gem_mmap_gtt_ioctl, DRM_UNLOCKED),
1898 DRM_IOCTL_DEF(DRM_I915_GEM_SET_DOMAIN, i915_gem_set_domain_ioctl, 0), 1898 DRM_IOCTL_DEF(DRM_I915_GEM_SET_DOMAIN, i915_gem_set_domain_ioctl, DRM_UNLOCKED),
1899 DRM_IOCTL_DEF(DRM_I915_GEM_SW_FINISH, i915_gem_sw_finish_ioctl, 0), 1899 DRM_IOCTL_DEF(DRM_I915_GEM_SW_FINISH, i915_gem_sw_finish_ioctl, DRM_UNLOCKED),
1900 DRM_IOCTL_DEF(DRM_I915_GEM_SET_TILING, i915_gem_set_tiling, 0), 1900 DRM_IOCTL_DEF(DRM_I915_GEM_SET_TILING, i915_gem_set_tiling, DRM_UNLOCKED),
1901 DRM_IOCTL_DEF(DRM_I915_GEM_GET_TILING, i915_gem_get_tiling, 0), 1901 DRM_IOCTL_DEF(DRM_I915_GEM_GET_TILING, i915_gem_get_tiling, DRM_UNLOCKED),
1902 DRM_IOCTL_DEF(DRM_I915_GEM_GET_APERTURE, i915_gem_get_aperture_ioctl, 0), 1902 DRM_IOCTL_DEF(DRM_I915_GEM_GET_APERTURE, i915_gem_get_aperture_ioctl, DRM_UNLOCKED),
1903 DRM_IOCTL_DEF(DRM_I915_GET_PIPE_FROM_CRTC_ID, intel_get_pipe_from_crtc_id, 0), 1903 DRM_IOCTL_DEF(DRM_I915_GET_PIPE_FROM_CRTC_ID, intel_get_pipe_from_crtc_id, DRM_UNLOCKED),
1904 DRM_IOCTL_DEF(DRM_I915_GEM_MADVISE, i915_gem_madvise_ioctl, 0), 1904 DRM_IOCTL_DEF(DRM_I915_GEM_MADVISE, i915_gem_madvise_ioctl, DRM_UNLOCKED),
1905 DRM_IOCTL_DEF(DRM_I915_OVERLAY_PUT_IMAGE, intel_overlay_put_image, DRM_MASTER|DRM_CONTROL_ALLOW), 1905 DRM_IOCTL_DEF(DRM_I915_OVERLAY_PUT_IMAGE, intel_overlay_put_image, DRM_MASTER|DRM_CONTROL_ALLOW|DRM_UNLOCKED),
1906 DRM_IOCTL_DEF(DRM_I915_OVERLAY_ATTRS, intel_overlay_attrs, DRM_MASTER|DRM_CONTROL_ALLOW), 1906 DRM_IOCTL_DEF(DRM_I915_OVERLAY_ATTRS, intel_overlay_attrs, DRM_MASTER|DRM_CONTROL_ALLOW|DRM_UNLOCKED),
1907}; 1907};
1908 1908
1909int i915_max_ioctl = DRM_ARRAY_SIZE(i915_ioctls); 1909int i915_max_ioctl = DRM_ARRAY_SIZE(i915_ioctls);
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 1b2e95455c05..4b26919abdb2 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -139,12 +139,12 @@ const static struct intel_device_info intel_ironlake_m_info = {
139 139
140const static struct intel_device_info intel_sandybridge_d_info = { 140const static struct intel_device_info intel_sandybridge_d_info = {
141 .is_i965g = 1, .is_i9xx = 1, .need_gfx_hws = 1, 141 .is_i965g = 1, .is_i9xx = 1, .need_gfx_hws = 1,
142 .has_hotplug = 1, 142 .has_hotplug = 1, .is_gen6 = 1,
143}; 143};
144 144
145const static struct intel_device_info intel_sandybridge_m_info = { 145const static struct intel_device_info intel_sandybridge_m_info = {
146 .is_i965g = 1, .is_mobile = 1, .is_i9xx = 1, .need_gfx_hws = 1, 146 .is_i965g = 1, .is_mobile = 1, .is_i9xx = 1, .need_gfx_hws = 1,
147 .has_hotplug = 1, 147 .has_hotplug = 1, .is_gen6 = 1,
148}; 148};
149 149
150const static struct pci_device_id pciidlist[] = { 150const static struct pci_device_id pciidlist[] = {
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 979439cfb827..aba8260fbc5e 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -205,6 +205,7 @@ struct intel_device_info {
205 u8 is_g4x : 1; 205 u8 is_g4x : 1;
206 u8 is_pineview : 1; 206 u8 is_pineview : 1;
207 u8 is_ironlake : 1; 207 u8 is_ironlake : 1;
208 u8 is_gen6 : 1;
208 u8 has_fbc : 1; 209 u8 has_fbc : 1;
209 u8 has_rc6 : 1; 210 u8 has_rc6 : 1;
210 u8 has_pipe_cxsr : 1; 211 u8 has_pipe_cxsr : 1;
@@ -1084,6 +1085,7 @@ extern int i915_wait_ring(struct drm_device * dev, int n, const char *caller);
1084#define IS_IRONLAKE_M(dev) ((dev)->pci_device == 0x0046) 1085#define IS_IRONLAKE_M(dev) ((dev)->pci_device == 0x0046)
1085#define IS_IRONLAKE(dev) (INTEL_INFO(dev)->is_ironlake) 1086#define IS_IRONLAKE(dev) (INTEL_INFO(dev)->is_ironlake)
1086#define IS_I9XX(dev) (INTEL_INFO(dev)->is_i9xx) 1087#define IS_I9XX(dev) (INTEL_INFO(dev)->is_i9xx)
1088#define IS_GEN6(dev) (INTEL_INFO(dev)->is_gen6)
1087#define IS_MOBILE(dev) (INTEL_INFO(dev)->is_mobile) 1089#define IS_MOBILE(dev) (INTEL_INFO(dev)->is_mobile)
1088 1090
1089#define IS_GEN3(dev) (IS_I915G(dev) || \ 1091#define IS_GEN3(dev) (IS_I915G(dev) || \
@@ -1107,8 +1109,6 @@ extern int i915_wait_ring(struct drm_device * dev, int n, const char *caller);
1107 1109
1108#define I915_NEED_GFX_HWS(dev) (INTEL_INFO(dev)->need_gfx_hws) 1110#define I915_NEED_GFX_HWS(dev) (INTEL_INFO(dev)->need_gfx_hws)
1109 1111
1110#define IS_GEN6(dev) ((dev)->pci_device == 0x0102)
1111
1112/* With the 945 and later, Y tiling got adjusted so that it was 32 128-byte 1112/* With the 945 and later, Y tiling got adjusted so that it was 32 128-byte
1113 * rows, which changed the alignment requirements and fence programming. 1113 * rows, which changed the alignment requirements and fence programming.
1114 */ 1114 */
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index fba37e9f775d..933e865a8929 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1466,9 +1466,6 @@ i915_gem_object_put_pages(struct drm_gem_object *obj)
1466 obj_priv->dirty = 0; 1466 obj_priv->dirty = 0;
1467 1467
1468 for (i = 0; i < page_count; i++) { 1468 for (i = 0; i < page_count; i++) {
1469 if (obj_priv->pages[i] == NULL)
1470 break;
1471
1472 if (obj_priv->dirty) 1469 if (obj_priv->dirty)
1473 set_page_dirty(obj_priv->pages[i]); 1470 set_page_dirty(obj_priv->pages[i]);
1474 1471
@@ -2227,11 +2224,6 @@ i915_gem_evict_something(struct drm_device *dev, int min_size)
2227 seqno = i915_add_request(dev, NULL, obj->write_domain); 2224 seqno = i915_add_request(dev, NULL, obj->write_domain);
2228 if (seqno == 0) 2225 if (seqno == 0)
2229 return -ENOMEM; 2226 return -ENOMEM;
2230
2231 ret = i915_wait_request(dev, seqno);
2232 if (ret)
2233 return ret;
2234
2235 continue; 2227 continue;
2236 } 2228 }
2237 } 2229 }
@@ -2256,7 +2248,6 @@ i915_gem_object_get_pages(struct drm_gem_object *obj,
2256 struct address_space *mapping; 2248 struct address_space *mapping;
2257 struct inode *inode; 2249 struct inode *inode;
2258 struct page *page; 2250 struct page *page;
2259 int ret;
2260 2251
2261 if (obj_priv->pages_refcount++ != 0) 2252 if (obj_priv->pages_refcount++ != 0)
2262 return 0; 2253 return 0;
@@ -2279,11 +2270,9 @@ i915_gem_object_get_pages(struct drm_gem_object *obj,
2279 mapping_gfp_mask (mapping) | 2270 mapping_gfp_mask (mapping) |
2280 __GFP_COLD | 2271 __GFP_COLD |
2281 gfpmask); 2272 gfpmask);
2282 if (IS_ERR(page)) { 2273 if (IS_ERR(page))
2283 ret = PTR_ERR(page); 2274 goto err_pages;
2284 i915_gem_object_put_pages(obj); 2275
2285 return ret;
2286 }
2287 obj_priv->pages[i] = page; 2276 obj_priv->pages[i] = page;
2288 } 2277 }
2289 2278
@@ -2291,6 +2280,15 @@ i915_gem_object_get_pages(struct drm_gem_object *obj,
2291 i915_gem_object_do_bit_17_swizzle(obj); 2280 i915_gem_object_do_bit_17_swizzle(obj);
2292 2281
2293 return 0; 2282 return 0;
2283
2284err_pages:
2285 while (i--)
2286 page_cache_release(obj_priv->pages[i]);
2287
2288 drm_free_large(obj_priv->pages);
2289 obj_priv->pages = NULL;
2290 obj_priv->pages_refcount--;
2291 return PTR_ERR(page);
2294} 2292}
2295 2293
2296static void sandybridge_write_fence_reg(struct drm_i915_fence_reg *reg) 2294static void sandybridge_write_fence_reg(struct drm_i915_fence_reg *reg)
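The err_pages label gives i915_gem_object_get_pages() a local unwind: it releases only the pages this call acquired, instead of recursing into put_pages() with a half-filled array (which is also why the NULL check removed above is no longer needed). The shape of the pattern, driver locals assumed:

	for (i = 0; i < page_count; i++) {
		page = read_cache_page_gfp(mapping, i, gfpmask);
		if (IS_ERR(page))
			goto err_pages;
		obj_priv->pages[i] = page;
	}
	return 0;

err_pages:
	while (i--)
		page_cache_release(obj_priv->pages[i]);
	drm_free_large(obj_priv->pages);
	obj_priv->pages = NULL;
	obj_priv->pages_refcount--;
	return PTR_ERR(page);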
@@ -4730,6 +4728,11 @@ i915_gem_init_ringbuffer(struct drm_device *dev)
4730 ring->space += ring->Size; 4728 ring->space += ring->Size;
4731 } 4729 }
4732 4730
4731 if (IS_I9XX(dev) && !IS_GEN3(dev)) {
4732 I915_WRITE(MI_MODE,
4733 (VS_TIMER_DISPATCH) << 16 | VS_TIMER_DISPATCH);
4734 }
4735
4733 return 0; 4736 return 0;
4734} 4737}
4735 4738
diff --git a/drivers/gpu/drm/i915/i915_gem_tiling.c b/drivers/gpu/drm/i915/i915_gem_tiling.c
index b5c55d88ff76..c01c878e51ba 100644
--- a/drivers/gpu/drm/i915/i915_gem_tiling.c
+++ b/drivers/gpu/drm/i915/i915_gem_tiling.c
@@ -325,9 +325,12 @@ i915_gem_set_tiling(struct drm_device *dev, void *data,
325 * need to ensure that any fence register is cleared. 325 * need to ensure that any fence register is cleared.
326 */ 326 */
327 if (!i915_gem_object_fence_offset_ok(obj, args->tiling_mode)) 327 if (!i915_gem_object_fence_offset_ok(obj, args->tiling_mode))
328 ret = i915_gem_object_unbind(obj); 328 ret = i915_gem_object_unbind(obj);
329 else if (obj_priv->fence_reg != I915_FENCE_REG_NONE)
330 ret = i915_gem_object_put_fence_reg(obj);
329 else 331 else
330 ret = i915_gem_object_put_fence_reg(obj); 332 i915_gem_release_mmap(obj);
333
331 if (ret != 0) { 334 if (ret != 0) {
332 WARN(ret != -ERESTARTSYS, 335 WARN(ret != -ERESTARTSYS,
333 "failed to reset object for tiling switch"); 336 "failed to reset object for tiling switch");
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 3d59862c7ccd..cbbf59f56dfa 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -298,6 +298,10 @@
298#define INSTDONE 0x02090 298#define INSTDONE 0x02090
299#define NOPID 0x02094 299#define NOPID 0x02094
300#define HWSTAM 0x02098 300#define HWSTAM 0x02098
301
302#define MI_MODE 0x0209c
303# define VS_TIMER_DISPATCH (1 << 6)
304
301#define SCPD0 0x0209c /* 915+ only */ 305#define SCPD0 0x0209c /* 915+ only */
302#define IER 0x020a0 306#define IER 0x020a0
303#define IIR 0x020a4 307#define IIR 0x020a4
@@ -366,7 +370,7 @@
366#define FBC_CTL_PERIODIC (1<<30) 370#define FBC_CTL_PERIODIC (1<<30)
367#define FBC_CTL_INTERVAL_SHIFT (16) 371#define FBC_CTL_INTERVAL_SHIFT (16)
368#define FBC_CTL_UNCOMPRESSIBLE (1<<14) 372#define FBC_CTL_UNCOMPRESSIBLE (1<<14)
369#define FBC_C3_IDLE (1<<13) 373#define FBC_CTL_C3_IDLE (1<<13)
370#define FBC_CTL_STRIDE_SHIFT (5) 374#define FBC_CTL_STRIDE_SHIFT (5)
371#define FBC_CTL_FENCENO (1<<0) 375#define FBC_CTL_FENCENO (1<<0)
372#define FBC_COMMAND 0x0320c 376#define FBC_COMMAND 0x0320c
@@ -2172,6 +2176,14 @@
2172#define DISPLAY_PORT_PLL_BIOS_1 0x46010 2176#define DISPLAY_PORT_PLL_BIOS_1 0x46010
2173#define DISPLAY_PORT_PLL_BIOS_2 0x46014 2177#define DISPLAY_PORT_PLL_BIOS_2 0x46014
2174 2178
2179#define PCH_DSPCLK_GATE_D 0x42020
2180# define DPFDUNIT_CLOCK_GATE_DISABLE (1 << 7)
2181# define DPARBUNIT_CLOCK_GATE_DISABLE (1 << 5)
2182
2183#define PCH_3DCGDIS0 0x46020
2184# define MARIUNIT_CLOCK_GATE_DISABLE (1 << 18)
2185# define SVSMUNIT_CLOCK_GATE_DISABLE (1 << 1)
2186
2175#define FDI_PLL_FREQ_CTL 0x46030 2187#define FDI_PLL_FREQ_CTL 0x46030
2176#define FDI_PLL_FREQ_CHANGE_REQUEST (1<<24) 2188#define FDI_PLL_FREQ_CHANGE_REQUEST (1<<24)
2177#define FDI_PLL_FREQ_LOCK_LIMIT_MASK 0xfff00 2189#define FDI_PLL_FREQ_LOCK_LIMIT_MASK 0xfff00
diff --git a/drivers/gpu/drm/i915/intel_bios.c b/drivers/gpu/drm/i915/intel_bios.c
index 70c9d4ba7042..f9ba452f0cbf 100644
--- a/drivers/gpu/drm/i915/intel_bios.c
+++ b/drivers/gpu/drm/i915/intel_bios.c
@@ -417,8 +417,9 @@ parse_edp(struct drm_i915_private *dev_priv, struct bdb_header *bdb)
417 edp = find_section(bdb, BDB_EDP); 417 edp = find_section(bdb, BDB_EDP);
418 if (!edp) { 418 if (!edp) {
419 if (SUPPORTS_EDP(dev_priv->dev) && dev_priv->edp_support) { 419 if (SUPPORTS_EDP(dev_priv->dev) && dev_priv->edp_support) {
420 DRM_DEBUG_KMS("No eDP BDB found but eDP panel supported,\ 420 DRM_DEBUG_KMS("No eDP BDB found but eDP panel "
421 assume 18bpp panel color depth.\n"); 421 "supported, assume 18bpp panel color "
422 "depth.\n");
422 dev_priv->edp_bpp = 18; 423 dev_priv->edp_bpp = 18;
423 } 424 }
424 return; 425 return;
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 9cd6de5f9906..58fc7fa0eb1d 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -1032,7 +1032,7 @@ static void i8xx_enable_fbc(struct drm_crtc *crtc, unsigned long interval)
1032 /* enable it... */ 1032 /* enable it... */
1033 fbc_ctl = FBC_CTL_EN | FBC_CTL_PERIODIC; 1033 fbc_ctl = FBC_CTL_EN | FBC_CTL_PERIODIC;
1034 if (IS_I945GM(dev)) 1034 if (IS_I945GM(dev))
1035 fbc_ctl |= FBC_C3_IDLE; /* 945 needs special SR handling */ 1035 fbc_ctl |= FBC_CTL_C3_IDLE; /* 945 needs special SR handling */
1036 fbc_ctl |= (dev_priv->cfb_pitch & 0xff) << FBC_CTL_STRIDE_SHIFT; 1036 fbc_ctl |= (dev_priv->cfb_pitch & 0xff) << FBC_CTL_STRIDE_SHIFT;
1037 fbc_ctl |= (interval & 0x2fff) << FBC_CTL_INTERVAL_SHIFT; 1037 fbc_ctl |= (interval & 0x2fff) << FBC_CTL_INTERVAL_SHIFT;
1038 if (obj_priv->tiling_mode != I915_TILING_NONE) 1038 if (obj_priv->tiling_mode != I915_TILING_NONE)
@@ -4717,6 +4717,20 @@ void intel_init_clock_gating(struct drm_device *dev)
4717 * specs, but enable as much else as we can. 4717 * specs, but enable as much else as we can.
4718 */ 4718 */
4719 if (HAS_PCH_SPLIT(dev)) { 4719 if (HAS_PCH_SPLIT(dev)) {
4720 uint32_t dspclk_gate = VRHUNIT_CLOCK_GATE_DISABLE;
4721
4722 if (IS_IRONLAKE(dev)) {
4723 /* Required for FBC */
4724 dspclk_gate |= DPFDUNIT_CLOCK_GATE_DISABLE;
4725 /* Required for CxSR */
4726 dspclk_gate |= DPARBUNIT_CLOCK_GATE_DISABLE;
4727
4728 I915_WRITE(PCH_3DCGDIS0,
4729 MARIUNIT_CLOCK_GATE_DISABLE |
4730 SVSMUNIT_CLOCK_GATE_DISABLE);
4731 }
4732
4733 I915_WRITE(PCH_DSPCLK_GATE_D, dspclk_gate);
4720 return; 4734 return;
4721 } else if (IS_G4X(dev)) { 4735 } else if (IS_G4X(dev)) {
4722 uint32_t dspclk_gate; 4736 uint32_t dspclk_gate;
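The clock-gating hunk accumulates the required disable bits in a local variable, adding platform-specific ones conditionally, and performs a single register write at the end. A sketch of that pattern, with illustrative bit positions only (the real offsets live in i915_reg.h):

    #include <stdint.h>
    #include <stdio.h>

    #define VRHUNIT_DISABLE   (1u << 3)   /* illustrative values */
    #define DPFDUNIT_DISABLE  (1u << 7)
    #define DPARBUNIT_DISABLE (1u << 5)

    static uint32_t fake_mmio;            /* stands in for an I915_WRITE() target */

    static void write_dspclk_gate(int is_ironlake)
    {
        uint32_t gate = VRHUNIT_DISABLE;  /* always-needed bits first */

        if (is_ironlake)                  /* platform-specific bits */
            gate |= DPFDUNIT_DISABLE | DPARBUNIT_DISABLE;

        fake_mmio = gate;                 /* one register write at the end */
    }

    int main(void)
    {
        write_dspclk_gate(1);
        printf("0x%08x\n", (unsigned)fake_mmio);
        return 0;
    }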
diff --git a/drivers/gpu/drm/i915/intel_lvds.c b/drivers/gpu/drm/i915/intel_lvds.c
index 14e516fdc2dd..2b3fa7a3c028 100644
--- a/drivers/gpu/drm/i915/intel_lvds.c
+++ b/drivers/gpu/drm/i915/intel_lvds.c
@@ -607,53 +607,6 @@ static void intel_lvds_mode_set(struct drm_encoder *encoder,
607 I915_WRITE(PFIT_CONTROL, lvds_priv->pfit_control); 607 I915_WRITE(PFIT_CONTROL, lvds_priv->pfit_control);
608} 608}
609 609
610/* Some lid devices report incorrect lid status, assume they're connected */
611static const struct dmi_system_id bad_lid_status[] = {
612 {
613 .ident = "Compaq nx9020",
614 .matches = {
615 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
616 DMI_MATCH(DMI_BOARD_NAME, "3084"),
617 },
618 },
619 {
620 .ident = "Samsung SX20S",
621 .matches = {
622 DMI_MATCH(DMI_SYS_VENDOR, "Samsung Electronics"),
623 DMI_MATCH(DMI_BOARD_NAME, "SX20S"),
624 },
625 },
626 {
627 .ident = "Aspire One",
628 .matches = {
629 DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
630 DMI_MATCH(DMI_PRODUCT_NAME, "Aspire one"),
631 },
632 },
633 {
634 .ident = "Aspire 1810T",
635 .matches = {
636 DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
637 DMI_MATCH(DMI_PRODUCT_NAME, "Aspire 1810T"),
638 },
639 },
640 {
641 .ident = "PC-81005",
642 .matches = {
643 DMI_MATCH(DMI_SYS_VENDOR, "MALATA"),
644 DMI_MATCH(DMI_PRODUCT_NAME, "PC-81005"),
645 },
646 },
647 {
648 .ident = "Clevo M5x0N",
649 .matches = {
650 DMI_MATCH(DMI_SYS_VENDOR, "CLEVO Co."),
651 DMI_MATCH(DMI_BOARD_NAME, "M5x0N"),
652 },
653 },
654 { }
655};
656
657/** 610/**
658 * Detect the LVDS connection. 611 * Detect the LVDS connection.
659 * 612 *
@@ -669,12 +622,9 @@ static enum drm_connector_status intel_lvds_detect(struct drm_connector *connect
669 /* ACPI lid methods were generally unreliable in this generation, so 622 /* ACPI lid methods were generally unreliable in this generation, so
670 * don't even bother. 623 * don't even bother.
671 */ 624 */
672 if (IS_GEN2(dev)) 625 if (IS_GEN2(dev) || IS_GEN3(dev))
673 return connector_status_connected; 626 return connector_status_connected;
674 627
675 if (!dmi_check_system(bad_lid_status) && !acpi_lid_open())
676 status = connector_status_disconnected;
677
678 return status; 628 return status;
679} 629}
680 630
diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c
index d355d1d527e7..60595fc26fdd 100644
--- a/drivers/gpu/drm/i915/intel_overlay.c
+++ b/drivers/gpu/drm/i915/intel_overlay.c
@@ -1068,14 +1068,18 @@ int intel_overlay_put_image(struct drm_device *dev, void *data,
1068 1068
1069 drmmode_obj = drm_mode_object_find(dev, put_image_rec->crtc_id, 1069 drmmode_obj = drm_mode_object_find(dev, put_image_rec->crtc_id,
1070 DRM_MODE_OBJECT_CRTC); 1070 DRM_MODE_OBJECT_CRTC);
1071 if (!drmmode_obj) 1071 if (!drmmode_obj) {
1072 return -ENOENT; 1072 ret = -ENOENT;
1073 goto out_free;
1074 }
1073 crtc = to_intel_crtc(obj_to_crtc(drmmode_obj)); 1075 crtc = to_intel_crtc(obj_to_crtc(drmmode_obj));
1074 1076
1075 new_bo = drm_gem_object_lookup(dev, file_priv, 1077 new_bo = drm_gem_object_lookup(dev, file_priv,
1076 put_image_rec->bo_handle); 1078 put_image_rec->bo_handle);
1077 if (!new_bo) 1079 if (!new_bo) {
1078 return -ENOENT; 1080 ret = -ENOENT;
1081 goto out_free;
1082 }
1079 1083
1080 mutex_lock(&dev->mode_config.mutex); 1084 mutex_lock(&dev->mode_config.mutex);
1081 mutex_lock(&dev->struct_mutex); 1085 mutex_lock(&dev->struct_mutex);
@@ -1165,6 +1169,7 @@ out_unlock:
1165 mutex_unlock(&dev->struct_mutex); 1169 mutex_unlock(&dev->struct_mutex);
1166 mutex_unlock(&dev->mode_config.mutex); 1170 mutex_unlock(&dev->mode_config.mutex);
1167 drm_gem_object_unreference_unlocked(new_bo); 1171 drm_gem_object_unreference_unlocked(new_bo);
1172out_free:
1168 kfree(params); 1173 kfree(params);
1169 1174
1170 return ret; 1175 return ret;
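The overlay fix routes the two early -ENOENT returns through a new out_free label so the params allocation is freed on every path instead of leaking. A toy model of the single-exit cleanup pattern (error codes are illustrative):

    #include <stdlib.h>

    static int put_image(int want_crtc, int want_bo)
    {
        int ret = 0;
        void *params = malloc(64);

        if (!params)
            return -12;         /* -ENOMEM: nothing to unwind yet */

        if (!want_crtc) {
            ret = -2;           /* -ENOENT; previously leaked params */
            goto out_free;
        }
        if (!want_bo) {
            ret = -2;
            goto out_free;
        }
        /* ... real work would happen here ... */
    out_free:
        free(params);           /* freed exactly once on every path */
        return ret;
    }

    int main(void) { return put_image(1, 0) ? 0 : 1; }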
diff --git a/drivers/i2c/busses/i2c-scmi.c b/drivers/i2c/busses/i2c-scmi.c
index 365e0becaf12..388cbdc96db7 100644
--- a/drivers/i2c/busses/i2c-scmi.c
+++ b/drivers/i2c/busses/i2c-scmi.c
@@ -33,6 +33,7 @@ struct acpi_smbus_cmi {
33 u8 cap_info:1; 33 u8 cap_info:1;
34 u8 cap_read:1; 34 u8 cap_read:1;
35 u8 cap_write:1; 35 u8 cap_write:1;
36 struct smbus_methods_t *methods;
36}; 37};
37 38
38static const struct smbus_methods_t smbus_methods = { 39static const struct smbus_methods_t smbus_methods = {
@@ -41,10 +42,19 @@ static const struct smbus_methods_t smbus_methods = {
41 .mt_sbw = "_SBW", 42 .mt_sbw = "_SBW",
42}; 43};
43 44
 45/* Some IBM BIOSes use a trailing underscore instead of the leading one */
46static const struct smbus_methods_t ibm_smbus_methods = {
47 .mt_info = "SBI_",
48 .mt_sbr = "SBR_",
49 .mt_sbw = "SBW_",
50};
51
44static const struct acpi_device_id acpi_smbus_cmi_ids[] = { 52static const struct acpi_device_id acpi_smbus_cmi_ids[] = {
45 {"SMBUS01", 0}, 53 {"SMBUS01", (kernel_ulong_t)&smbus_methods},
54 {ACPI_SMBUS_IBM_HID, (kernel_ulong_t)&ibm_smbus_methods},
46 {"", 0} 55 {"", 0}
47}; 56};
57MODULE_DEVICE_TABLE(acpi, acpi_smbus_cmi_ids);
48 58
49#define ACPI_SMBUS_STATUS_OK 0x00 59#define ACPI_SMBUS_STATUS_OK 0x00
50#define ACPI_SMBUS_STATUS_FAIL 0x07 60#define ACPI_SMBUS_STATUS_FAIL 0x07
@@ -150,11 +160,11 @@ acpi_smbus_cmi_access(struct i2c_adapter *adap, u16 addr, unsigned short flags,
150 160
151 if (read_write == I2C_SMBUS_READ) { 161 if (read_write == I2C_SMBUS_READ) {
152 protocol |= ACPI_SMBUS_PRTCL_READ; 162 protocol |= ACPI_SMBUS_PRTCL_READ;
153 method = smbus_methods.mt_sbr; 163 method = smbus_cmi->methods->mt_sbr;
154 input.count = 3; 164 input.count = 3;
155 } else { 165 } else {
156 protocol |= ACPI_SMBUS_PRTCL_WRITE; 166 protocol |= ACPI_SMBUS_PRTCL_WRITE;
157 method = smbus_methods.mt_sbw; 167 method = smbus_cmi->methods->mt_sbw;
158 input.count = 5; 168 input.count = 5;
159 } 169 }
160 170
@@ -290,13 +300,13 @@ static int acpi_smbus_cmi_add_cap(struct acpi_smbus_cmi *smbus_cmi,
290 union acpi_object *obj; 300 union acpi_object *obj;
291 acpi_status status; 301 acpi_status status;
292 302
293 if (!strcmp(name, smbus_methods.mt_info)) { 303 if (!strcmp(name, smbus_cmi->methods->mt_info)) {
294 status = acpi_evaluate_object(smbus_cmi->handle, 304 status = acpi_evaluate_object(smbus_cmi->handle,
295 smbus_methods.mt_info, 305 smbus_cmi->methods->mt_info,
296 NULL, &buffer); 306 NULL, &buffer);
297 if (ACPI_FAILURE(status)) { 307 if (ACPI_FAILURE(status)) {
298 ACPI_ERROR((AE_INFO, "Evaluating %s: %i", 308 ACPI_ERROR((AE_INFO, "Evaluating %s: %i",
299 smbus_methods.mt_info, status)); 309 smbus_cmi->methods->mt_info, status));
300 return -EIO; 310 return -EIO;
301 } 311 }
302 312
@@ -319,9 +329,9 @@ static int acpi_smbus_cmi_add_cap(struct acpi_smbus_cmi *smbus_cmi,
319 329
320 kfree(buffer.pointer); 330 kfree(buffer.pointer);
321 smbus_cmi->cap_info = 1; 331 smbus_cmi->cap_info = 1;
322 } else if (!strcmp(name, smbus_methods.mt_sbr)) 332 } else if (!strcmp(name, smbus_cmi->methods->mt_sbr))
323 smbus_cmi->cap_read = 1; 333 smbus_cmi->cap_read = 1;
324 else if (!strcmp(name, smbus_methods.mt_sbw)) 334 else if (!strcmp(name, smbus_cmi->methods->mt_sbw))
325 smbus_cmi->cap_write = 1; 335 smbus_cmi->cap_write = 1;
326 else 336 else
327 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Unsupported CMI method: %s\n", 337 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Unsupported CMI method: %s\n",
@@ -349,6 +359,7 @@ static acpi_status acpi_smbus_cmi_query_methods(acpi_handle handle, u32 level,
349static int acpi_smbus_cmi_add(struct acpi_device *device) 359static int acpi_smbus_cmi_add(struct acpi_device *device)
350{ 360{
351 struct acpi_smbus_cmi *smbus_cmi; 361 struct acpi_smbus_cmi *smbus_cmi;
362 const struct acpi_device_id *id;
352 363
353 smbus_cmi = kzalloc(sizeof(struct acpi_smbus_cmi), GFP_KERNEL); 364 smbus_cmi = kzalloc(sizeof(struct acpi_smbus_cmi), GFP_KERNEL);
354 if (!smbus_cmi) 365 if (!smbus_cmi)
@@ -362,6 +373,11 @@ static int acpi_smbus_cmi_add(struct acpi_device *device)
362 smbus_cmi->cap_read = 0; 373 smbus_cmi->cap_read = 0;
363 smbus_cmi->cap_write = 0; 374 smbus_cmi->cap_write = 0;
364 375
376 for (id = acpi_smbus_cmi_ids; id->id[0]; id++)
377 if (!strcmp(id->id, acpi_device_hid(device)))
378 smbus_cmi->methods =
379 (struct smbus_methods_t *) id->driver_data;
380
365 acpi_walk_namespace(ACPI_TYPE_METHOD, smbus_cmi->handle, 1, 381 acpi_walk_namespace(ACPI_TYPE_METHOD, smbus_cmi->handle, 1,
366 acpi_smbus_cmi_query_methods, NULL, smbus_cmi, NULL); 382 acpi_smbus_cmi_query_methods, NULL, smbus_cmi, NULL);
367 383
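The i2c-scmi change stores a per-vendor method table in each acpi_device_id's driver_data and selects it by matching the device HID at probe time. A self-contained sketch of that lookup, with a hypothetical "SMBUSIBM" HID string since the real ACPI_SMBUS_IBM_HID value is defined elsewhere:

    #include <stdio.h>
    #include <string.h>

    struct methods { const char *info, *read, *write; };

    static const struct methods std_methods = { "_SBI", "_SBR", "_SBW" };
    static const struct methods ibm_methods = { "SBI_", "SBR_", "SBW_" };

    /* Mirrors acpi_device_id: match string plus an opaque per-entry pointer. */
    struct id_entry { const char *hid; const struct methods *data; };

    static const struct id_entry ids[] = {
        { "SMBUS01",  &std_methods },
        { "SMBUSIBM", &ibm_methods },   /* hypothetical HID string */
        { "", NULL }
    };

    static const struct methods *lookup(const char *hid)
    {
        const struct id_entry *id;

        for (id = ids; id->hid[0]; id++)
            if (!strcmp(id->hid, hid))
                return id->data;
        return NULL;
    }

    int main(void)
    {
        printf("%s\n", lookup("SMBUSIBM")->read);   /* prints SBR_ */
        return 0;
    }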
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index 1558bb7fc74d..f901957abc8b 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -461,6 +461,7 @@ alloc_group_attrs(ssize_t (*show)(struct ib_port *,
461 element->attr.attr.mode = S_IRUGO; 461 element->attr.attr.mode = S_IRUGO;
462 element->attr.show = show; 462 element->attr.show = show;
463 element->index = i; 463 element->index = i;
464 sysfs_attr_init(&element->attr.attr);
464 465
465 tab_attr[i] = &element->attr.attr; 466 tab_attr[i] = &element->attr.attr;
466 } 467 }
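This sysfs_attr_init() call (and the matching ones in mlx4, regulator core, and pci-sysfs below) exists because lockdep needs each dynamically allocated attribute to carry its own key before registration. A toy single-file model of the rule "initialize the embedded attribute before handing the object to the framework"; the real helper sets lockdep state rather than a flag:

    #include <stdio.h>
    #include <stdlib.h>

    struct attribute { const char *name; int initialized; };
    struct port { int index; struct attribute attr; };

    static void attr_init(struct attribute *a) { a->initialized = 1; }

    static int attr_register(struct attribute *a)
    {
        if (!a->initialized) {
            fprintf(stderr, "BUG: uninitialized attribute\n");
            return -1;
        }
        printf("registered %s\n", a->name);
        return 0;
    }

    int main(void)
    {
        struct port *p = calloc(1, sizeof(*p));

        p->attr.name = "rate";
        attr_init(&p->attr);    /* must happen before registration */
        attr_register(&p->attr);
        free(p);
        return 0;
    }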
diff --git a/drivers/misc/c2port/core.c b/drivers/misc/c2port/core.c
index b5346b4db91a..b7a85f46a6c2 100644
--- a/drivers/misc/c2port/core.c
+++ b/drivers/misc/c2port/core.c
@@ -912,8 +912,8 @@ struct c2port_device *c2port_device_register(char *name,
912 912
913 c2dev->dev = device_create(c2port_class, NULL, 0, c2dev, 913 c2dev->dev = device_create(c2port_class, NULL, 0, c2dev,
914 "c2port%d", id); 914 "c2port%d", id);
915 if (unlikely(!c2dev->dev)) { 915 if (unlikely(IS_ERR(c2dev->dev))) {
916 ret = -ENOMEM; 916 ret = PTR_ERR(c2dev->dev);
917 goto error_device_create; 917 goto error_device_create;
918 } 918 }
919 dev_set_drvdata(c2dev->dev, c2dev); 919 dev_set_drvdata(c2dev->dev, c2dev);
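The c2port fix matters because device_create() never returns NULL on failure; it returns an error encoded in the pointer itself, so the check must be IS_ERR()/PTR_ERR(). A minimal re-implementation of that convention (errors occupy the top 4095 addresses of the pointer range):

    #include <stdio.h>

    #define MAX_ERRNO 4095

    static void *ERR_PTR(long err) { return (void *)err; }
    static long PTR_ERR(const void *p) { return (long)p; }
    static int IS_ERR(const void *p)
    {
        return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
    }

    static void *fake_device_create(int fail)
    {
        static int dev;
        return fail ? ERR_PTR(-12) : &dev;  /* -ENOMEM on failure */
    }

    int main(void)
    {
        void *d = fake_device_create(1);

        if (IS_ERR(d))          /* a plain !d check would miss this */
            printf("error %ld\n", PTR_ERR(d));
        return 0;
    }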
diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c
index 0eac6c814904..e041c003db22 100644
--- a/drivers/mmc/core/mmc.c
+++ b/drivers/mmc/core/mmc.c
@@ -225,7 +225,7 @@ static int mmc_read_ext_csd(struct mmc_card *card)
225 mmc_card_set_blockaddr(card); 225 mmc_card_set_blockaddr(card);
226 } 226 }
227 227
228 switch (ext_csd[EXT_CSD_CARD_TYPE]) { 228 switch (ext_csd[EXT_CSD_CARD_TYPE] & EXT_CSD_CARD_TYPE_MASK) {
229 case EXT_CSD_CARD_TYPE_52 | EXT_CSD_CARD_TYPE_26: 229 case EXT_CSD_CARD_TYPE_52 | EXT_CSD_CARD_TYPE_26:
230 card->ext_csd.hs_max_dtr = 52000000; 230 card->ext_csd.hs_max_dtr = 52000000;
231 break; 231 break;
@@ -237,7 +237,6 @@ static int mmc_read_ext_csd(struct mmc_card *card)
237 printk(KERN_WARNING "%s: card is mmc v4 but doesn't " 237 printk(KERN_WARNING "%s: card is mmc v4 but doesn't "
238 "support any high-speed modes.\n", 238 "support any high-speed modes.\n",
239 mmc_hostname(card->host)); 239 mmc_hostname(card->host));
240 goto out;
241 } 240 }
242 241
243 if (card->ext_csd.rev >= 3) { 242 if (card->ext_csd.rev >= 3) {
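Masking EXT_CSD_CARD_TYPE before the switch keeps reserved or future capability bits from making every case miss, which previously mis-flagged valid cards as lacking high-speed support. A small sketch of why the mask matters, with illustrative bit values:

    #include <stdio.h>

    #define TYPE_26   (1u << 0)
    #define TYPE_52   (1u << 1)
    #define TYPE_MASK (TYPE_26 | TYPE_52)   /* bits this code understands */

    static unsigned speed_for(unsigned card_type)
    {
        /* Without the mask, an unknown future bit (e.g. 1 << 2) would
         * fall through to the default and disable high-speed modes. */
        switch (card_type & TYPE_MASK) {
        case TYPE_52 | TYPE_26:
            return 52000000;
        case TYPE_26:
            return 26000000;
        default:
            return 0;
        }
    }

    int main(void)
    {
        printf("%u\n", speed_for(TYPE_52 | TYPE_26 | (1u << 2)));  /* 52000000 */
        return 0;
    }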
diff --git a/drivers/mtd/maps/omap_nor.c b/drivers/mtd/maps/omap_nor.c
deleted file mode 100644
index e69de29bb2d1..000000000000
--- a/drivers/mtd/maps/omap_nor.c
+++ /dev/null
diff --git a/drivers/net/arm/ks8695net.c b/drivers/net/arm/ks8695net.c
index a1d4188c430b..e7810b74f396 100644
--- a/drivers/net/arm/ks8695net.c
+++ b/drivers/net/arm/ks8695net.c
@@ -449,11 +449,10 @@ ks8695_rx_irq(int irq, void *dev_id)
449} 449}
450 450
451/** 451/**
452 * ks8695_rx - Receive packets called by NAPI poll method 452 * ks8695_rx - Receive packets called by NAPI poll method
453 * @ksp: Private data for the KS8695 Ethernet 453 * @ksp: Private data for the KS8695 Ethernet
454 * @budget: The max packets would be receive 454 * @budget: Number of packets allowed to process
455 */ 455 */
456
457static int ks8695_rx(struct ks8695_priv *ksp, int budget) 456static int ks8695_rx(struct ks8695_priv *ksp, int budget)
458{ 457{
459 struct net_device *ndev = ksp->ndev; 458 struct net_device *ndev = ksp->ndev;
@@ -461,7 +460,6 @@ static int ks8695_rx(struct ks8695_priv *ksp, int budget)
461 int buff_n; 460 int buff_n;
462 u32 flags; 461 u32 flags;
463 int pktlen; 462 int pktlen;
464 int last_rx_processed = -1;
465 int received = 0; 463 int received = 0;
466 464
467 buff_n = ksp->next_rx_desc_read; 465 buff_n = ksp->next_rx_desc_read;
@@ -471,6 +469,7 @@ static int ks8695_rx(struct ks8695_priv *ksp, int budget)
471 cpu_to_le32(RDES_OWN)))) { 469 cpu_to_le32(RDES_OWN)))) {
472 rmb(); 470 rmb();
473 flags = le32_to_cpu(ksp->rx_ring[buff_n].status); 471 flags = le32_to_cpu(ksp->rx_ring[buff_n].status);
472
474 /* Found an SKB which we own, this means we 473 /* Found an SKB which we own, this means we
475 * received a packet 474 * received a packet
476 */ 475 */
@@ -533,23 +532,18 @@ rx_failure:
533 ksp->rx_ring[buff_n].status = cpu_to_le32(RDES_OWN); 532 ksp->rx_ring[buff_n].status = cpu_to_le32(RDES_OWN);
534rx_finished: 533rx_finished:
535 received++; 534 received++;
536 /* And note this as processed so we can start
537 * from here next time
538 */
539 last_rx_processed = buff_n;
540 buff_n = (buff_n + 1) & MAX_RX_DESC_MASK; 535 buff_n = (buff_n + 1) & MAX_RX_DESC_MASK;
541 /*And note which RX descriptor we last did */
542 if (likely(last_rx_processed != -1))
543 ksp->next_rx_desc_read =
544 (last_rx_processed + 1) &
545 MAX_RX_DESC_MASK;
546 } 536 }
537
538 /* And note which RX descriptor we last did */
539 ksp->next_rx_desc_read = buff_n;
540
547 /* And refill the buffers */ 541 /* And refill the buffers */
548 ks8695_refill_rxbuffers(ksp); 542 ks8695_refill_rxbuffers(ksp);
549 543
550 /* Kick the RX DMA engine, in case it became 544 /* Kick the RX DMA engine, in case it became suspended */
551 * suspended */
552 ks8695_writereg(ksp, KS8695_DRSC, 0); 545 ks8695_writereg(ksp, KS8695_DRSC, 0);
546
553 return received; 547 return received;
554} 548}
555 549
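The ks8695 cleanup drops the last_rx_processed bookkeeping: because buff_n is advanced with the same power-of-two mask on every pass, its value after the loop is already the next descriptor to read. A runnable sketch of the masked ring-index arithmetic:

    #include <stdio.h>

    #define RING_SIZE 16                    /* must be a power of two */
    #define RING_MASK (RING_SIZE - 1)

    int main(void)
    {
        unsigned buff_n = 14, received = 0;

        while (received < 5) {              /* pretend 5 packets were ready */
            /* ... process descriptor buff_n ... */
            received++;
            buff_n = (buff_n + 1) & RING_MASK;  /* wraps 15 -> 0 cheaply */
        }
        /* this single assignment replaces the per-iteration tracking */
        printf("next_rx_desc_read = %u\n", buff_n);  /* prints 3 */
        return 0;
    }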
diff --git a/drivers/net/igb/e1000_82575.c b/drivers/net/igb/e1000_82575.c
index 9d7fa2fb85ea..0bc990ec4a8e 100644
--- a/drivers/net/igb/e1000_82575.c
+++ b/drivers/net/igb/e1000_82575.c
@@ -94,6 +94,7 @@ static s32 igb_get_invariants_82575(struct e1000_hw *hw)
94 case E1000_DEV_ID_82576_FIBER: 94 case E1000_DEV_ID_82576_FIBER:
95 case E1000_DEV_ID_82576_SERDES: 95 case E1000_DEV_ID_82576_SERDES:
96 case E1000_DEV_ID_82576_QUAD_COPPER: 96 case E1000_DEV_ID_82576_QUAD_COPPER:
97 case E1000_DEV_ID_82576_QUAD_COPPER_ET2:
97 case E1000_DEV_ID_82576_SERDES_QUAD: 98 case E1000_DEV_ID_82576_SERDES_QUAD:
98 mac->type = e1000_82576; 99 mac->type = e1000_82576;
99 break; 100 break;
diff --git a/drivers/net/igb/e1000_hw.h b/drivers/net/igb/e1000_hw.h
index 448005276b26..82a533f5192a 100644
--- a/drivers/net/igb/e1000_hw.h
+++ b/drivers/net/igb/e1000_hw.h
@@ -41,6 +41,7 @@ struct e1000_hw;
41#define E1000_DEV_ID_82576_FIBER 0x10E6 41#define E1000_DEV_ID_82576_FIBER 0x10E6
42#define E1000_DEV_ID_82576_SERDES 0x10E7 42#define E1000_DEV_ID_82576_SERDES 0x10E7
43#define E1000_DEV_ID_82576_QUAD_COPPER 0x10E8 43#define E1000_DEV_ID_82576_QUAD_COPPER 0x10E8
44#define E1000_DEV_ID_82576_QUAD_COPPER_ET2 0x1526
44#define E1000_DEV_ID_82576_NS 0x150A 45#define E1000_DEV_ID_82576_NS 0x150A
45#define E1000_DEV_ID_82576_NS_SERDES 0x1518 46#define E1000_DEV_ID_82576_NS_SERDES 0x1518
46#define E1000_DEV_ID_82576_SERDES_QUAD 0x150D 47#define E1000_DEV_ID_82576_SERDES_QUAD 0x150D
diff --git a/drivers/net/igb/igb_main.c b/drivers/net/igb/igb_main.c
index 0ed25f059a00..45a0e4fd5871 100644
--- a/drivers/net/igb/igb_main.c
+++ b/drivers/net/igb/igb_main.c
@@ -72,6 +72,7 @@ static DEFINE_PCI_DEVICE_TABLE(igb_pci_tbl) = {
72 { PCI_VDEVICE(INTEL, E1000_DEV_ID_82576_FIBER), board_82575 }, 72 { PCI_VDEVICE(INTEL, E1000_DEV_ID_82576_FIBER), board_82575 },
73 { PCI_VDEVICE(INTEL, E1000_DEV_ID_82576_SERDES), board_82575 }, 73 { PCI_VDEVICE(INTEL, E1000_DEV_ID_82576_SERDES), board_82575 },
74 { PCI_VDEVICE(INTEL, E1000_DEV_ID_82576_SERDES_QUAD), board_82575 }, 74 { PCI_VDEVICE(INTEL, E1000_DEV_ID_82576_SERDES_QUAD), board_82575 },
75 { PCI_VDEVICE(INTEL, E1000_DEV_ID_82576_QUAD_COPPER_ET2), board_82575 },
75 { PCI_VDEVICE(INTEL, E1000_DEV_ID_82576_QUAD_COPPER), board_82575 }, 76 { PCI_VDEVICE(INTEL, E1000_DEV_ID_82576_QUAD_COPPER), board_82575 },
76 { PCI_VDEVICE(INTEL, E1000_DEV_ID_82575EB_COPPER), board_82575 }, 77 { PCI_VDEVICE(INTEL, E1000_DEV_ID_82575EB_COPPER), board_82575 },
77 { PCI_VDEVICE(INTEL, E1000_DEV_ID_82575EB_FIBER_SERDES), board_82575 }, 78 { PCI_VDEVICE(INTEL, E1000_DEV_ID_82575EB_FIBER_SERDES), board_82575 },
diff --git a/drivers/net/ixgbe/ixgbe_82599.c b/drivers/net/ixgbe/ixgbe_82599.c
index 1f30e163bd9c..b405a00817c6 100644
--- a/drivers/net/ixgbe/ixgbe_82599.c
+++ b/drivers/net/ixgbe/ixgbe_82599.c
@@ -39,6 +39,7 @@
39#define IXGBE_82599_MC_TBL_SIZE 128 39#define IXGBE_82599_MC_TBL_SIZE 128
40#define IXGBE_82599_VFT_TBL_SIZE 128 40#define IXGBE_82599_VFT_TBL_SIZE 128
41 41
42void ixgbe_flap_tx_laser_multispeed_fiber(struct ixgbe_hw *hw);
42s32 ixgbe_setup_mac_link_multispeed_fiber(struct ixgbe_hw *hw, 43s32 ixgbe_setup_mac_link_multispeed_fiber(struct ixgbe_hw *hw,
43 ixgbe_link_speed speed, 44 ixgbe_link_speed speed,
44 bool autoneg, 45 bool autoneg,
@@ -68,7 +69,9 @@ static void ixgbe_init_mac_link_ops_82599(struct ixgbe_hw *hw)
68 if (hw->phy.multispeed_fiber) { 69 if (hw->phy.multispeed_fiber) {
69 /* Set up dual speed SFP+ support */ 70 /* Set up dual speed SFP+ support */
70 mac->ops.setup_link = &ixgbe_setup_mac_link_multispeed_fiber; 71 mac->ops.setup_link = &ixgbe_setup_mac_link_multispeed_fiber;
72 mac->ops.flap_tx_laser = &ixgbe_flap_tx_laser_multispeed_fiber;
71 } else { 73 } else {
74 mac->ops.flap_tx_laser = NULL;
72 if ((mac->ops.get_media_type(hw) == 75 if ((mac->ops.get_media_type(hw) ==
73 ixgbe_media_type_backplane) && 76 ixgbe_media_type_backplane) &&
74 (hw->phy.smart_speed == ixgbe_smart_speed_auto || 77 (hw->phy.smart_speed == ixgbe_smart_speed_auto ||
@@ -413,6 +416,41 @@ s32 ixgbe_start_mac_link_82599(struct ixgbe_hw *hw,
413} 416}
414 417
415/** 418/**
419 * ixgbe_flap_tx_laser_multispeed_fiber - Flap Tx laser
420 * @hw: pointer to hardware structure
421 *
422 * When the driver changes the link speeds that it can support,
423 * it sets autotry_restart to true to indicate that we need to
424 * initiate a new autotry session with the link partner. To do
425 * so, we set the speed then disable and re-enable the tx laser, to
426 * alert the link partner that it also needs to restart autotry on its
427 * end. This is consistent with true clause 37 autoneg, which also
428 * involves a loss of signal.
429 **/
430void ixgbe_flap_tx_laser_multispeed_fiber(struct ixgbe_hw *hw)
431{
432 u32 esdp_reg = IXGBE_READ_REG(hw, IXGBE_ESDP);
433
434 hw_dbg(hw, "ixgbe_flap_tx_laser_multispeed_fiber\n");
435
436 if (hw->mac.autotry_restart) {
437 /* Disable tx laser; allow 100us to go dark per spec */
438 esdp_reg |= IXGBE_ESDP_SDP3;
439 IXGBE_WRITE_REG(hw, IXGBE_ESDP, esdp_reg);
440 IXGBE_WRITE_FLUSH(hw);
441 udelay(100);
442
443 /* Enable tx laser; allow 100ms to light up */
444 esdp_reg &= ~IXGBE_ESDP_SDP3;
445 IXGBE_WRITE_REG(hw, IXGBE_ESDP, esdp_reg);
446 IXGBE_WRITE_FLUSH(hw);
447 msleep(100);
448
449 hw->mac.autotry_restart = false;
450 }
451}
452
453/**
416 * ixgbe_setup_mac_link_multispeed_fiber - Set MAC link speed 454 * ixgbe_setup_mac_link_multispeed_fiber - Set MAC link speed
417 * @hw: pointer to hardware structure 455 * @hw: pointer to hardware structure
418 * @speed: new link speed 456 * @speed: new link speed
@@ -440,16 +478,6 @@ s32 ixgbe_setup_mac_link_multispeed_fiber(struct ixgbe_hw *hw,
440 speed &= phy_link_speed; 478 speed &= phy_link_speed;
441 479
442 /* 480 /*
443 * When the driver changes the link speeds that it can support,
444 * it sets autotry_restart to true to indicate that we need to
445 * initiate a new autotry session with the link partner. To do
446 * so, we set the speed then disable and re-enable the tx laser, to
447 * alert the link partner that it also needs to restart autotry on its
448 * end. This is consistent with true clause 37 autoneg, which also
449 * involves a loss of signal.
450 */
451
452 /*
453 * Try each speed one by one, highest priority first. We do this in 481 * Try each speed one by one, highest priority first. We do this in
454 * software because 10gb fiber doesn't support speed autonegotiation. 482 * software because 10gb fiber doesn't support speed autonegotiation.
455 */ 483 */
@@ -466,6 +494,7 @@ s32 ixgbe_setup_mac_link_multispeed_fiber(struct ixgbe_hw *hw,
466 /* Set the module link speed */ 494 /* Set the module link speed */
467 esdp_reg |= (IXGBE_ESDP_SDP5_DIR | IXGBE_ESDP_SDP5); 495 esdp_reg |= (IXGBE_ESDP_SDP5_DIR | IXGBE_ESDP_SDP5);
468 IXGBE_WRITE_REG(hw, IXGBE_ESDP, esdp_reg); 496 IXGBE_WRITE_REG(hw, IXGBE_ESDP, esdp_reg);
497 IXGBE_WRITE_FLUSH(hw);
469 498
470 /* Allow module to change analog characteristics (1G->10G) */ 499 /* Allow module to change analog characteristics (1G->10G) */
471 msleep(40); 500 msleep(40);
@@ -478,19 +507,7 @@ s32 ixgbe_setup_mac_link_multispeed_fiber(struct ixgbe_hw *hw,
478 return status; 507 return status;
479 508
480 /* Flap the tx laser if it has not already been done */ 509 /* Flap the tx laser if it has not already been done */
481 if (hw->mac.autotry_restart) { 510 hw->mac.ops.flap_tx_laser(hw);
482 /* Disable tx laser; allow 100us to go dark per spec */
483 esdp_reg |= IXGBE_ESDP_SDP3;
484 IXGBE_WRITE_REG(hw, IXGBE_ESDP, esdp_reg);
485 udelay(100);
486
487 /* Enable tx laser; allow 2ms to light up per spec */
488 esdp_reg &= ~IXGBE_ESDP_SDP3;
489 IXGBE_WRITE_REG(hw, IXGBE_ESDP, esdp_reg);
490 msleep(2);
491
492 hw->mac.autotry_restart = false;
493 }
494 511
495 /* 512 /*
496 * Wait for the controller to acquire link. Per IEEE 802.3ap, 513 * Wait for the controller to acquire link. Per IEEE 802.3ap,
@@ -525,6 +542,7 @@ s32 ixgbe_setup_mac_link_multispeed_fiber(struct ixgbe_hw *hw,
525 esdp_reg &= ~IXGBE_ESDP_SDP5; 542 esdp_reg &= ~IXGBE_ESDP_SDP5;
526 esdp_reg |= IXGBE_ESDP_SDP5_DIR; 543 esdp_reg |= IXGBE_ESDP_SDP5_DIR;
527 IXGBE_WRITE_REG(hw, IXGBE_ESDP, esdp_reg); 544 IXGBE_WRITE_REG(hw, IXGBE_ESDP, esdp_reg);
545 IXGBE_WRITE_FLUSH(hw);
528 546
529 /* Allow module to change analog characteristics (10G->1G) */ 547 /* Allow module to change analog characteristics (10G->1G) */
530 msleep(40); 548 msleep(40);
@@ -537,19 +555,7 @@ s32 ixgbe_setup_mac_link_multispeed_fiber(struct ixgbe_hw *hw,
537 return status; 555 return status;
538 556
539 /* Flap the tx laser if it has not already been done */ 557 /* Flap the tx laser if it has not already been done */
540 if (hw->mac.autotry_restart) { 558 hw->mac.ops.flap_tx_laser(hw);
541 /* Disable tx laser; allow 100us to go dark per spec */
542 esdp_reg |= IXGBE_ESDP_SDP3;
543 IXGBE_WRITE_REG(hw, IXGBE_ESDP, esdp_reg);
544 udelay(100);
545
546 /* Enable tx laser; allow 2ms to light up per spec */
547 esdp_reg &= ~IXGBE_ESDP_SDP3;
548 IXGBE_WRITE_REG(hw, IXGBE_ESDP, esdp_reg);
549 msleep(2);
550
551 hw->mac.autotry_restart = false;
552 }
553 559
554 /* Wait for the link partner to also set speed */ 560 /* Wait for the link partner to also set speed */
555 msleep(100); 561 msleep(100);
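The ixgbe refactor hoists two identical laser-flap blocks into one function reached through the mac ops table, so other MAC types can supply their own implementation or none at all. A sketch of that ops-table dispatch, with a simplified hypothetical struct layout:

    #include <stdio.h>

    struct hw;
    struct mac_ops { void (*flap_tx_laser)(struct hw *); };
    struct hw { struct mac_ops ops; int autotry_restart; };

    static void flap_multispeed(struct hw *hw)
    {
        if (hw->autotry_restart) {
            printf("laser off/on\n");   /* the formerly duplicated sequence */
            hw->autotry_restart = 0;
        }
    }

    int main(void)
    {
        struct hw hw = { { flap_multispeed }, 1 };

        if (hw.ops.flap_tx_laser)       /* callback may be absent */
            hw.ops.flap_tx_laser(&hw);
        return 0;
    }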
diff --git a/drivers/net/ixgbe/ixgbe_fcoe.c b/drivers/net/ixgbe/ixgbe_fcoe.c
index 4123dec0dfb7..700cfc0aa1b9 100644
--- a/drivers/net/ixgbe/ixgbe_fcoe.c
+++ b/drivers/net/ixgbe/ixgbe_fcoe.c
@@ -614,9 +614,9 @@ int ixgbe_fcoe_enable(struct net_device *netdev)
614 netdev->vlan_features |= NETIF_F_FSO; 614 netdev->vlan_features |= NETIF_F_FSO;
615 netdev->vlan_features |= NETIF_F_FCOE_MTU; 615 netdev->vlan_features |= NETIF_F_FCOE_MTU;
616 netdev->fcoe_ddp_xid = IXGBE_FCOE_DDP_MAX - 1; 616 netdev->fcoe_ddp_xid = IXGBE_FCOE_DDP_MAX - 1;
617 netdev_features_change(netdev);
618 617
619 ixgbe_init_interrupt_scheme(adapter); 618 ixgbe_init_interrupt_scheme(adapter);
619 netdev_features_change(netdev);
620 620
621 if (netif_running(netdev)) 621 if (netif_running(netdev))
622 netdev->netdev_ops->ndo_open(netdev); 622 netdev->netdev_ops->ndo_open(netdev);
@@ -660,11 +660,11 @@ int ixgbe_fcoe_disable(struct net_device *netdev)
660 netdev->vlan_features &= ~NETIF_F_FSO; 660 netdev->vlan_features &= ~NETIF_F_FSO;
661 netdev->vlan_features &= ~NETIF_F_FCOE_MTU; 661 netdev->vlan_features &= ~NETIF_F_FCOE_MTU;
662 netdev->fcoe_ddp_xid = 0; 662 netdev->fcoe_ddp_xid = 0;
663 netdev_features_change(netdev);
664 663
665 ixgbe_cleanup_fcoe(adapter); 664 ixgbe_cleanup_fcoe(adapter);
666
667 ixgbe_init_interrupt_scheme(adapter); 665 ixgbe_init_interrupt_scheme(adapter);
666 netdev_features_change(netdev);
667
668 if (netif_running(netdev)) 668 if (netif_running(netdev))
669 netdev->netdev_ops->ndo_open(netdev); 669 netdev->netdev_ops->ndo_open(netdev);
670 rc = 0; 670 rc = 0;
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 684af371462d..d75c46ff31f6 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -935,10 +935,12 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
935 if (skb->prev) 935 if (skb->prev)
936 skb = ixgbe_transform_rsc_queue(skb, &(rx_ring->rsc_count)); 936 skb = ixgbe_transform_rsc_queue(skb, &(rx_ring->rsc_count));
937 if (adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED) { 937 if (adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED) {
938 if (IXGBE_RSC_CB(skb)->dma) 938 if (IXGBE_RSC_CB(skb)->dma) {
939 pci_unmap_single(pdev, IXGBE_RSC_CB(skb)->dma, 939 pci_unmap_single(pdev, IXGBE_RSC_CB(skb)->dma,
940 rx_ring->rx_buf_len, 940 rx_ring->rx_buf_len,
941 PCI_DMA_FROMDEVICE); 941 PCI_DMA_FROMDEVICE);
942 IXGBE_RSC_CB(skb)->dma = 0;
943 }
942 if (rx_ring->flags & IXGBE_RING_RX_PS_ENABLED) 944 if (rx_ring->flags & IXGBE_RING_RX_PS_ENABLED)
943 rx_ring->rsc_count += skb_shinfo(skb)->nr_frags; 945 rx_ring->rsc_count += skb_shinfo(skb)->nr_frags;
944 else 946 else
@@ -3126,10 +3128,12 @@ static void ixgbe_clean_rx_ring(struct ixgbe_adapter *adapter,
3126 rx_buffer_info->skb = NULL; 3128 rx_buffer_info->skb = NULL;
3127 do { 3129 do {
3128 struct sk_buff *this = skb; 3130 struct sk_buff *this = skb;
3129 if (IXGBE_RSC_CB(this)->dma) 3131 if (IXGBE_RSC_CB(this)->dma) {
3130 pci_unmap_single(pdev, IXGBE_RSC_CB(this)->dma, 3132 pci_unmap_single(pdev, IXGBE_RSC_CB(this)->dma,
3131 rx_ring->rx_buf_len, 3133 rx_ring->rx_buf_len,
3132 PCI_DMA_FROMDEVICE); 3134 PCI_DMA_FROMDEVICE);
3135 IXGBE_RSC_CB(this)->dma = 0;
3136 }
3133 skb = skb->prev; 3137 skb = skb->prev;
3134 dev_kfree_skb(this); 3138 dev_kfree_skb(this);
3135 } while (skb); 3139 } while (skb);
@@ -5018,6 +5022,7 @@ static void ixgbe_multispeed_fiber_task(struct work_struct *work)
5018 autoneg = hw->phy.autoneg_advertised; 5022 autoneg = hw->phy.autoneg_advertised;
5019 if ((!autoneg) && (hw->mac.ops.get_link_capabilities)) 5023 if ((!autoneg) && (hw->mac.ops.get_link_capabilities))
5020 hw->mac.ops.get_link_capabilities(hw, &autoneg, &negotiation); 5024 hw->mac.ops.get_link_capabilities(hw, &autoneg, &negotiation);
5025 hw->mac.autotry_restart = false;
5021 if (hw->mac.ops.setup_link) 5026 if (hw->mac.ops.setup_link)
5022 hw->mac.ops.setup_link(hw, autoneg, negotiation, true); 5027 hw->mac.ops.setup_link(hw, autoneg, negotiation, true);
5023 adapter->flags |= IXGBE_FLAG_NEED_LINK_UPDATE; 5028 adapter->flags |= IXGBE_FLAG_NEED_LINK_UPDATE;
@@ -6245,9 +6250,6 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev,
6245 case IXGBE_DEV_ID_82599_KX4: 6250 case IXGBE_DEV_ID_82599_KX4:
6246 adapter->wol = (IXGBE_WUFC_MAG | IXGBE_WUFC_EX | 6251 adapter->wol = (IXGBE_WUFC_MAG | IXGBE_WUFC_EX |
6247 IXGBE_WUFC_MC | IXGBE_WUFC_BC); 6252 IXGBE_WUFC_MC | IXGBE_WUFC_BC);
6248 /* Enable ACPI wakeup in GRC */
6249 IXGBE_WRITE_REG(hw, IXGBE_GRC,
6250 (IXGBE_READ_REG(hw, IXGBE_GRC) & ~IXGBE_GRC_APME));
6251 break; 6253 break;
6252 default: 6254 default:
6253 adapter->wol = 0; 6255 adapter->wol = 0;
@@ -6380,6 +6382,16 @@ static void __devexit ixgbe_remove(struct pci_dev *pdev)
6380 del_timer_sync(&adapter->sfp_timer); 6382 del_timer_sync(&adapter->sfp_timer);
6381 cancel_work_sync(&adapter->watchdog_task); 6383 cancel_work_sync(&adapter->watchdog_task);
6382 cancel_work_sync(&adapter->sfp_task); 6384 cancel_work_sync(&adapter->sfp_task);
6385 if (adapter->hw.phy.multispeed_fiber) {
6386 struct ixgbe_hw *hw = &adapter->hw;
6387 /*
6388 * Restart clause 37 autoneg, disable and re-enable
6389 * the tx laser, to clear & alert the link partner
6390 * that it needs to restart autotry
6391 */
6392 hw->mac.autotry_restart = true;
6393 hw->mac.ops.flap_tx_laser(hw);
6394 }
6383 cancel_work_sync(&adapter->multispeed_fiber_task); 6395 cancel_work_sync(&adapter->multispeed_fiber_task);
6384 cancel_work_sync(&adapter->sfp_config_module_task); 6396 cancel_work_sync(&adapter->sfp_config_module_task);
6385 if (adapter->flags & IXGBE_FLAG_FDIR_HASH_CAPABLE || 6397 if (adapter->flags & IXGBE_FLAG_FDIR_HASH_CAPABLE ||
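Both RSC hunks above add "dma = 0" immediately after pci_unmap_single(). The handle can be reached again through the chained skbs, so a stale value invites a double unmap. A toy model of the unmap-then-clear idiom:

    #include <stdio.h>

    static int unmapped;

    static void fake_unmap(unsigned long dma)
    {
        unmapped++;                 /* stands in for pci_unmap_single() */
    }

    static void release(unsigned long *dma)
    {
        if (*dma) {
            fake_unmap(*dma);
            *dma = 0;               /* the fix: a second pass is now a no-op */
        }
    }

    int main(void)
    {
        unsigned long dma = 0xdead0;

        release(&dma);
        release(&dma);              /* must not unmap again */
        printf("unmapped %d time(s)\n", unmapped);  /* prints 1 */
        return 0;
    }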
diff --git a/drivers/net/ixgbe/ixgbe_type.h b/drivers/net/ixgbe/ixgbe_type.h
index 2be907466593..0ed5ab37cc53 100644
--- a/drivers/net/ixgbe/ixgbe_type.h
+++ b/drivers/net/ixgbe/ixgbe_type.h
@@ -2397,6 +2397,7 @@ struct ixgbe_mac_operations {
2397 s32 (*enable_rx_dma)(struct ixgbe_hw *, u32); 2397 s32 (*enable_rx_dma)(struct ixgbe_hw *, u32);
2398 2398
2399 /* Link */ 2399 /* Link */
2400 void (*flap_tx_laser)(struct ixgbe_hw *);
2400 s32 (*setup_link)(struct ixgbe_hw *, ixgbe_link_speed, bool, bool); 2401 s32 (*setup_link)(struct ixgbe_hw *, ixgbe_link_speed, bool, bool);
2401 s32 (*check_link)(struct ixgbe_hw *, ixgbe_link_speed *, bool *, bool); 2402 s32 (*check_link)(struct ixgbe_hw *, ixgbe_link_speed *, bool *, bool);
2402 s32 (*get_link_capabilities)(struct ixgbe_hw *, ixgbe_link_speed *, 2403 s32 (*get_link_capabilities)(struct ixgbe_hw *, ixgbe_link_speed *,
diff --git a/drivers/net/ixgbevf/ethtool.c b/drivers/net/ixgbevf/ethtool.c
index 399be0c34c36..6fdd651abcd1 100644
--- a/drivers/net/ixgbevf/ethtool.c
+++ b/drivers/net/ixgbevf/ethtool.c
@@ -46,22 +46,32 @@ struct ixgbe_stats {
46 int sizeof_stat; 46 int sizeof_stat;
47 int stat_offset; 47 int stat_offset;
48 int base_stat_offset; 48 int base_stat_offset;
49 int saved_reset_offset;
49}; 50};
50 51
51#define IXGBEVF_STAT(m, b) sizeof(((struct ixgbevf_adapter *)0)->m), \ 52#define IXGBEVF_STAT(m, b, r) sizeof(((struct ixgbevf_adapter *)0)->m), \
52 offsetof(struct ixgbevf_adapter, m), \ 53 offsetof(struct ixgbevf_adapter, m), \
53 offsetof(struct ixgbevf_adapter, b) 54 offsetof(struct ixgbevf_adapter, b), \
55 offsetof(struct ixgbevf_adapter, r)
54static struct ixgbe_stats ixgbe_gstrings_stats[] = { 56static struct ixgbe_stats ixgbe_gstrings_stats[] = {
55 {"rx_packets", IXGBEVF_STAT(stats.vfgprc, stats.base_vfgprc)}, 57 {"rx_packets", IXGBEVF_STAT(stats.vfgprc, stats.base_vfgprc,
56 {"tx_packets", IXGBEVF_STAT(stats.vfgptc, stats.base_vfgptc)}, 58 stats.saved_reset_vfgprc)},
57 {"rx_bytes", IXGBEVF_STAT(stats.vfgorc, stats.base_vfgorc)}, 59 {"tx_packets", IXGBEVF_STAT(stats.vfgptc, stats.base_vfgptc,
58 {"tx_bytes", IXGBEVF_STAT(stats.vfgotc, stats.base_vfgotc)}, 60 stats.saved_reset_vfgptc)},
59 {"tx_busy", IXGBEVF_STAT(tx_busy, zero_base)}, 61 {"rx_bytes", IXGBEVF_STAT(stats.vfgorc, stats.base_vfgorc,
60 {"multicast", IXGBEVF_STAT(stats.vfmprc, stats.base_vfmprc)}, 62 stats.saved_reset_vfgorc)},
61 {"rx_csum_offload_good", IXGBEVF_STAT(hw_csum_rx_good, zero_base)}, 63 {"tx_bytes", IXGBEVF_STAT(stats.vfgotc, stats.base_vfgotc,
62 {"rx_csum_offload_errors", IXGBEVF_STAT(hw_csum_rx_error, zero_base)}, 64 stats.saved_reset_vfgotc)},
63 {"tx_csum_offload_ctxt", IXGBEVF_STAT(hw_csum_tx_good, zero_base)}, 65 {"tx_busy", IXGBEVF_STAT(tx_busy, zero_base, zero_base)},
64 {"rx_header_split", IXGBEVF_STAT(rx_hdr_split, zero_base)}, 66 {"multicast", IXGBEVF_STAT(stats.vfmprc, stats.base_vfmprc,
67 stats.saved_reset_vfmprc)},
68 {"rx_csum_offload_good", IXGBEVF_STAT(hw_csum_rx_good, zero_base,
69 zero_base)},
70 {"rx_csum_offload_errors", IXGBEVF_STAT(hw_csum_rx_error, zero_base,
71 zero_base)},
72 {"tx_csum_offload_ctxt", IXGBEVF_STAT(hw_csum_tx_good, zero_base,
73 zero_base)},
74 {"rx_header_split", IXGBEVF_STAT(rx_hdr_split, zero_base, zero_base)},
65}; 75};
66 76
67#define IXGBE_QUEUE_STATS_LEN 0 77#define IXGBE_QUEUE_STATS_LEN 0
@@ -455,10 +465,14 @@ static void ixgbevf_get_ethtool_stats(struct net_device *netdev,
455 ixgbe_gstrings_stats[i].stat_offset; 465 ixgbe_gstrings_stats[i].stat_offset;
456 char *b = (char *)adapter + 466 char *b = (char *)adapter +
457 ixgbe_gstrings_stats[i].base_stat_offset; 467 ixgbe_gstrings_stats[i].base_stat_offset;
468 char *r = (char *)adapter +
469 ixgbe_gstrings_stats[i].saved_reset_offset;
458 data[i] = ((ixgbe_gstrings_stats[i].sizeof_stat == 470 data[i] = ((ixgbe_gstrings_stats[i].sizeof_stat ==
459 sizeof(u64)) ? *(u64 *)p : *(u32 *)p) - 471 sizeof(u64)) ? *(u64 *)p : *(u32 *)p) -
460 ((ixgbe_gstrings_stats[i].sizeof_stat == 472 ((ixgbe_gstrings_stats[i].sizeof_stat ==
461 sizeof(u64)) ? *(u64 *)b : *(u32 *)b); 473 sizeof(u64)) ? *(u64 *)b : *(u32 *)b) +
474 ((ixgbe_gstrings_stats[i].sizeof_stat ==
475 sizeof(u64)) ? *(u64 *)r : *(u32 *)r);
462 } 476 }
463} 477}
464 478
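The reshaped ethtool stat is computed as current - base + saved_reset, so counts banked before a VF reset survive the hardware counters restarting from zero. A worked arithmetic example with illustrative numbers:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t saved_reset = 0, base = 0, cur = 0;

        cur = 100;                          /* hw counter before reset */
        saved_reset += cur - base;          /* bank 100 at reset time */
        cur = 40; base = 0;                 /* counter restarted from 0 */

        /* 40 - 0 + 100 = 140: no packets lost across the reset */
        printf("%llu\n", (unsigned long long)(cur - base + saved_reset));
        return 0;
    }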
diff --git a/drivers/net/ixgbevf/ixgbevf_main.c b/drivers/net/ixgbevf/ixgbevf_main.c
index ca653c49b765..d6cbd943a6f0 100644
--- a/drivers/net/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ixgbevf/ixgbevf_main.c
@@ -965,7 +965,7 @@ static irqreturn_t ixgbevf_msix_mbx(int irq, void *data)
965 965
966 if ((msg & IXGBE_MBVFICR_VFREQ_MASK) == IXGBE_PF_CONTROL_MSG) 966 if ((msg & IXGBE_MBVFICR_VFREQ_MASK) == IXGBE_PF_CONTROL_MSG)
967 mod_timer(&adapter->watchdog_timer, 967 mod_timer(&adapter->watchdog_timer,
968 round_jiffies(jiffies + 10)); 968 round_jiffies(jiffies + 1));
969 969
970 return IRQ_HANDLED; 970 return IRQ_HANDLED;
971} 971}
@@ -1610,6 +1610,44 @@ static inline void ixgbevf_rx_desc_queue_enable(struct ixgbevf_adapter *adapter,
1610 (adapter->rx_ring[rxr].count - 1)); 1610 (adapter->rx_ring[rxr].count - 1));
1611} 1611}
1612 1612
1613static void ixgbevf_save_reset_stats(struct ixgbevf_adapter *adapter)
1614{
1615 /* Only save pre-reset stats if there are some */
1616 if (adapter->stats.vfgprc || adapter->stats.vfgptc) {
1617 adapter->stats.saved_reset_vfgprc += adapter->stats.vfgprc -
1618 adapter->stats.base_vfgprc;
1619 adapter->stats.saved_reset_vfgptc += adapter->stats.vfgptc -
1620 adapter->stats.base_vfgptc;
1621 adapter->stats.saved_reset_vfgorc += adapter->stats.vfgorc -
1622 adapter->stats.base_vfgorc;
1623 adapter->stats.saved_reset_vfgotc += adapter->stats.vfgotc -
1624 adapter->stats.base_vfgotc;
1625 adapter->stats.saved_reset_vfmprc += adapter->stats.vfmprc -
1626 adapter->stats.base_vfmprc;
1627 }
1628}
1629
1630static void ixgbevf_init_last_counter_stats(struct ixgbevf_adapter *adapter)
1631{
1632 struct ixgbe_hw *hw = &adapter->hw;
1633
1634 adapter->stats.last_vfgprc = IXGBE_READ_REG(hw, IXGBE_VFGPRC);
1635 adapter->stats.last_vfgorc = IXGBE_READ_REG(hw, IXGBE_VFGORC_LSB);
1636 adapter->stats.last_vfgorc |=
1637 (((u64)(IXGBE_READ_REG(hw, IXGBE_VFGORC_MSB))) << 32);
1638 adapter->stats.last_vfgptc = IXGBE_READ_REG(hw, IXGBE_VFGPTC);
1639 adapter->stats.last_vfgotc = IXGBE_READ_REG(hw, IXGBE_VFGOTC_LSB);
1640 adapter->stats.last_vfgotc |=
1641 (((u64)(IXGBE_READ_REG(hw, IXGBE_VFGOTC_MSB))) << 32);
1642 adapter->stats.last_vfmprc = IXGBE_READ_REG(hw, IXGBE_VFMPRC);
1643
1644 adapter->stats.base_vfgprc = adapter->stats.last_vfgprc;
1645 adapter->stats.base_vfgorc = adapter->stats.last_vfgorc;
1646 adapter->stats.base_vfgptc = adapter->stats.last_vfgptc;
1647 adapter->stats.base_vfgotc = adapter->stats.last_vfgotc;
1648 adapter->stats.base_vfmprc = adapter->stats.last_vfmprc;
1649}
1650
1613static int ixgbevf_up_complete(struct ixgbevf_adapter *adapter) 1651static int ixgbevf_up_complete(struct ixgbevf_adapter *adapter)
1614{ 1652{
1615 struct net_device *netdev = adapter->netdev; 1653 struct net_device *netdev = adapter->netdev;
@@ -1656,6 +1694,9 @@ static int ixgbevf_up_complete(struct ixgbevf_adapter *adapter)
1656 /* enable transmits */ 1694 /* enable transmits */
1657 netif_tx_start_all_queues(netdev); 1695 netif_tx_start_all_queues(netdev);
1658 1696
1697 ixgbevf_save_reset_stats(adapter);
1698 ixgbevf_init_last_counter_stats(adapter);
1699
1659 /* bring the link up in the watchdog, this could race with our first 1700 /* bring the link up in the watchdog, this could race with our first
1660 * link up interrupt but shouldn't be a problem */ 1701 * link up interrupt but shouldn't be a problem */
1661 adapter->flags |= IXGBE_FLAG_NEED_LINK_UPDATE; 1702 adapter->flags |= IXGBE_FLAG_NEED_LINK_UPDATE;
@@ -2228,27 +2269,6 @@ out:
2228 return err; 2269 return err;
2229} 2270}
2230 2271
2231static void ixgbevf_init_last_counter_stats(struct ixgbevf_adapter *adapter)
2232{
2233 struct ixgbe_hw *hw = &adapter->hw;
2234
2235 adapter->stats.last_vfgprc = IXGBE_READ_REG(hw, IXGBE_VFGPRC);
2236 adapter->stats.last_vfgorc = IXGBE_READ_REG(hw, IXGBE_VFGORC_LSB);
2237 adapter->stats.last_vfgorc |=
2238 (((u64)(IXGBE_READ_REG(hw, IXGBE_VFGORC_MSB))) << 32);
2239 adapter->stats.last_vfgptc = IXGBE_READ_REG(hw, IXGBE_VFGPTC);
2240 adapter->stats.last_vfgotc = IXGBE_READ_REG(hw, IXGBE_VFGOTC_LSB);
2241 adapter->stats.last_vfgotc |=
2242 (((u64)(IXGBE_READ_REG(hw, IXGBE_VFGOTC_MSB))) << 32);
2243 adapter->stats.last_vfmprc = IXGBE_READ_REG(hw, IXGBE_VFMPRC);
2244
2245 adapter->stats.base_vfgprc = adapter->stats.last_vfgprc;
2246 adapter->stats.base_vfgorc = adapter->stats.last_vfgorc;
2247 adapter->stats.base_vfgptc = adapter->stats.last_vfgptc;
2248 adapter->stats.base_vfgotc = adapter->stats.last_vfgotc;
2249 adapter->stats.base_vfmprc = adapter->stats.last_vfmprc;
2250}
2251
2252#define UPDATE_VF_COUNTER_32bit(reg, last_counter, counter) \ 2272#define UPDATE_VF_COUNTER_32bit(reg, last_counter, counter) \
2253 { \ 2273 { \
2254 u32 current_counter = IXGBE_READ_REG(hw, reg); \ 2274 u32 current_counter = IXGBE_READ_REG(hw, reg); \
@@ -2399,7 +2419,7 @@ static void ixgbevf_watchdog_task(struct work_struct *work)
2399 if (!netif_carrier_ok(netdev)) { 2419 if (!netif_carrier_ok(netdev)) {
2400 hw_dbg(&adapter->hw, "NIC Link is Up %s, ", 2420 hw_dbg(&adapter->hw, "NIC Link is Up %s, ",
2401 ((link_speed == IXGBE_LINK_SPEED_10GB_FULL) ? 2421 ((link_speed == IXGBE_LINK_SPEED_10GB_FULL) ?
2402 "10 Gbps" : "1 Gbps")); 2422 "10 Gbps\n" : "1 Gbps\n"));
2403 netif_carrier_on(netdev); 2423 netif_carrier_on(netdev);
2404 netif_tx_wake_all_queues(netdev); 2424 netif_tx_wake_all_queues(netdev);
2405 } else { 2425 } else {
@@ -2416,9 +2436,9 @@ static void ixgbevf_watchdog_task(struct work_struct *work)
2416 } 2436 }
2417 } 2437 }
2418 2438
2419pf_has_reset:
2420 ixgbevf_update_stats(adapter); 2439 ixgbevf_update_stats(adapter);
2421 2440
2441pf_has_reset:
2422 /* Force detection of hung controller every watchdog period */ 2442 /* Force detection of hung controller every watchdog period */
2423 adapter->detect_tx_hung = true; 2443 adapter->detect_tx_hung = true;
2424 2444
@@ -2675,7 +2695,7 @@ static int ixgbevf_open(struct net_device *netdev)
2675 if (hw->adapter_stopped) { 2695 if (hw->adapter_stopped) {
2676 err = IXGBE_ERR_MBX; 2696 err = IXGBE_ERR_MBX;
2677 printk(KERN_ERR "Unable to start - perhaps the PF" 2697 printk(KERN_ERR "Unable to start - perhaps the PF"
2678 "Driver isn't up yet\n"); 2698 " Driver isn't up yet\n");
2679 goto err_setup_reset; 2699 goto err_setup_reset;
2680 } 2700 }
2681 } 2701 }
@@ -3390,8 +3410,6 @@ static int __devinit ixgbevf_probe(struct pci_dev *pdev,
3390 /* setup the private structure */ 3410 /* setup the private structure */
3391 err = ixgbevf_sw_init(adapter); 3411 err = ixgbevf_sw_init(adapter);
3392 3412
3393 ixgbevf_init_last_counter_stats(adapter);
3394
3395#ifdef MAX_SKB_FRAGS 3413#ifdef MAX_SKB_FRAGS
3396 netdev->features = NETIF_F_SG | 3414 netdev->features = NETIF_F_SG |
3397 NETIF_F_IP_CSUM | 3415 NETIF_F_IP_CSUM |
@@ -3449,6 +3467,8 @@ static int __devinit ixgbevf_probe(struct pci_dev *pdev,
3449 3467
3450 adapter->netdev_registered = true; 3468 adapter->netdev_registered = true;
3451 3469
3470 ixgbevf_init_last_counter_stats(adapter);
3471
3452 /* print the MAC address */ 3472 /* print the MAC address */
3453 hw_dbg(hw, "%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x\n", 3473 hw_dbg(hw, "%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x\n",
3454 netdev->dev_addr[0], 3474 netdev->dev_addr[0],
diff --git a/drivers/net/ixgbevf/vf.h b/drivers/net/ixgbevf/vf.h
index 799600e92700..1f31b052d4b4 100644
--- a/drivers/net/ixgbevf/vf.h
+++ b/drivers/net/ixgbevf/vf.h
@@ -157,6 +157,12 @@ struct ixgbevf_hw_stats {
157 u64 vfgorc; 157 u64 vfgorc;
158 u64 vfgotc; 158 u64 vfgotc;
159 u64 vfmprc; 159 u64 vfmprc;
160
161 u64 saved_reset_vfgprc;
162 u64 saved_reset_vfgptc;
163 u64 saved_reset_vfgorc;
164 u64 saved_reset_vfgotc;
165 u64 saved_reset_vfmprc;
160}; 166};
161 167
162struct ixgbevf_info { 168struct ixgbevf_info {
diff --git a/drivers/net/jme.c b/drivers/net/jme.c
index 0f31497833df..c0b59a555384 100644
--- a/drivers/net/jme.c
+++ b/drivers/net/jme.c
@@ -946,6 +946,8 @@ jme_alloc_and_feed_skb(struct jme_adapter *jme, int idx)
946 jme->jme_vlan_rx(skb, jme->vlgrp, 946 jme->jme_vlan_rx(skb, jme->vlgrp,
947 le16_to_cpu(rxdesc->descwb.vlan)); 947 le16_to_cpu(rxdesc->descwb.vlan));
948 NET_STAT(jme).rx_bytes += 4; 948 NET_STAT(jme).rx_bytes += 4;
949 } else {
950 dev_kfree_skb(skb);
949 } 951 }
950 } else { 952 } else {
951 jme->jme_rx(skb); 953 jme->jme_rx(skb);
@@ -2081,12 +2083,45 @@ jme_tx_timeout(struct net_device *netdev)
2081 jme_reset_link(jme); 2083 jme_reset_link(jme);
2082} 2084}
2083 2085
2086static inline void jme_pause_rx(struct jme_adapter *jme)
2087{
2088 atomic_dec(&jme->link_changing);
2089
2090 jme_set_rx_pcc(jme, PCC_OFF);
2091 if (test_bit(JME_FLAG_POLL, &jme->flags)) {
2092 JME_NAPI_DISABLE(jme);
2093 } else {
2094 tasklet_disable(&jme->rxclean_task);
2095 tasklet_disable(&jme->rxempty_task);
2096 }
2097}
2098
2099static inline void jme_resume_rx(struct jme_adapter *jme)
2100{
2101 struct dynpcc_info *dpi = &(jme->dpi);
2102
2103 if (test_bit(JME_FLAG_POLL, &jme->flags)) {
2104 JME_NAPI_ENABLE(jme);
2105 } else {
2106 tasklet_hi_enable(&jme->rxclean_task);
2107 tasklet_hi_enable(&jme->rxempty_task);
2108 }
2109 dpi->cur = PCC_P1;
2110 dpi->attempt = PCC_P1;
2111 dpi->cnt = 0;
2112 jme_set_rx_pcc(jme, PCC_P1);
2113
2114 atomic_inc(&jme->link_changing);
2115}
2116
2084static void 2117static void
2085jme_vlan_rx_register(struct net_device *netdev, struct vlan_group *grp) 2118jme_vlan_rx_register(struct net_device *netdev, struct vlan_group *grp)
2086{ 2119{
2087 struct jme_adapter *jme = netdev_priv(netdev); 2120 struct jme_adapter *jme = netdev_priv(netdev);
2088 2121
2122 jme_pause_rx(jme);
2089 jme->vlgrp = grp; 2123 jme->vlgrp = grp;
2124 jme_resume_rx(jme);
2090} 2125}
2091 2126
2092static void 2127static void
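jme_vlan_rx_register() now brackets the vlgrp update with pause/resume helpers, quiescing NAPI or the RX tasklets so the receive path never dereferences the group pointer mid-swap. A toy single-threaded model of the quiesce-update-resume sequence:

    #include <stdio.h>

    static int rx_enabled = 1;
    static void *vlgrp;

    static void pause_rx(void)  { rx_enabled = 0; /* NAPI/tasklets off */ }
    static void resume_rx(void) { rx_enabled = 1; /* coalescing reset too */ }

    static void vlan_rx_register(void *grp)
    {
        pause_rx();
        vlgrp = grp;            /* safe: no receiver is running */
        resume_rx();
    }

    int main(void)
    {
        int group;

        vlan_rx_register(&group);
        printf("rx_enabled=%d vlgrp set=%d\n", rx_enabled, vlgrp != NULL);
        return 0;
    }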
diff --git a/drivers/net/jme.h b/drivers/net/jme.h
index c19db9146a2f..07ad3a457185 100644
--- a/drivers/net/jme.h
+++ b/drivers/net/jme.h
@@ -25,7 +25,7 @@
25#define __JME_H_INCLUDED__ 25#define __JME_H_INCLUDED__
26 26
27#define DRV_NAME "jme" 27#define DRV_NAME "jme"
28#define DRV_VERSION "1.0.5" 28#define DRV_VERSION "1.0.6"
29#define PFX DRV_NAME ": " 29#define PFX DRV_NAME ": "
30 30
31#define PCI_DEVICE_ID_JMICRON_JMC250 0x0250 31#define PCI_DEVICE_ID_JMICRON_JMC250 0x0250
diff --git a/drivers/net/ks8851.c b/drivers/net/ks8851.c
index 0573e0bb4444..13cc1ca261d9 100644
--- a/drivers/net/ks8851.c
+++ b/drivers/net/ks8851.c
@@ -976,7 +976,6 @@ static void ks8851_set_rx_mode(struct net_device *dev)
976 crc >>= (32 - 6); /* get top six bits */ 976 crc >>= (32 - 6); /* get top six bits */
977 977
978 rxctrl.mchash[crc >> 4] |= (1 << (crc & 0xf)); 978 rxctrl.mchash[crc >> 4] |= (1 << (crc & 0xf));
979 mcptr = mcptr->next;
980 } 979 }
981 980
982 rxctrl.rxcr1 = RXCR1_RXME | RXCR1_RXPAFMA; 981 rxctrl.rxcr1 = RXCR1_RXME | RXCR1_RXPAFMA;
diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index 8f6e816a7395..b402a95c87c7 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -1023,6 +1023,7 @@ static int mlx4_init_port_info(struct mlx4_dev *dev, int port)
1023 info->port_attr.attr.mode = S_IRUGO | S_IWUSR; 1023 info->port_attr.attr.mode = S_IRUGO | S_IWUSR;
1024 info->port_attr.show = show_port_type; 1024 info->port_attr.show = show_port_type;
1025 info->port_attr.store = set_port_type; 1025 info->port_attr.store = set_port_type;
1026 sysfs_attr_init(&info->port_attr.attr);
1026 1027
1027 err = device_create_file(&dev->pdev->dev, &info->port_attr); 1028 err = device_create_file(&dev->pdev->dev, &info->port_attr);
1028 if (err) { 1029 if (err) {
diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c
index d222d7e25273..73f9a31cf94d 100644
--- a/drivers/net/usb/smsc95xx.c
+++ b/drivers/net/usb/smsc95xx.c
@@ -1189,9 +1189,21 @@ static struct sk_buff *smsc95xx_tx_fixup(struct usbnet *dev,
1189 } 1189 }
1190 1190
1191 if (csum) { 1191 if (csum) {
1192 u32 csum_preamble = smsc95xx_calc_csum_preamble(skb); 1192 if (skb->len <= 45) {
1193 skb_push(skb, 4); 1193 /* workaround - hardware tx checksum does not work
1194 memcpy(skb->data, &csum_preamble, 4); 1194 * properly with extremely small packets */
1195 long csstart = skb->csum_start - skb_headroom(skb);
1196 __wsum calc = csum_partial(skb->data + csstart,
1197 skb->len - csstart, 0);
1198 *((__sum16 *)(skb->data + csstart
1199 + skb->csum_offset)) = csum_fold(calc);
1200
1201 csum = false;
1202 } else {
1203 u32 csum_preamble = smsc95xx_calc_csum_preamble(skb);
1204 skb_push(skb, 4);
1205 memcpy(skb->data, &csum_preamble, 4);
1206 }
1195 } 1207 }
1196 1208
1197 skb_push(skb, 4); 1209 skb_push(skb, 4);
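For frames of 45 bytes or less the smsc95xx workaround computes the checksum in software rather than trusting the broken hardware offload. A minimal version of the one's complement sum plus fold that csum_partial()/csum_fold() perform, over an illustrative payload:

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    static uint16_t csum(const uint8_t *data, size_t len)
    {
        uint32_t sum = 0;
        size_t i;

        for (i = 0; i + 1 < len; i += 2)
            sum += (uint32_t)data[i] << 8 | data[i + 1];
        if (len & 1)
            sum += (uint32_t)data[len - 1] << 8;    /* odd trailing byte */
        while (sum >> 16)
            sum = (sum & 0xffff) + (sum >> 16);     /* fold the carries */
        return (uint16_t)~sum;
    }

    int main(void)
    {
        uint8_t payload[] = { 0x45, 0x00, 0x00, 0x1c };

        printf("0x%04x\n", csum(payload, sizeof(payload)));
        return 0;
    }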
diff --git a/drivers/net/wireless/ath/ath9k/xmit.c b/drivers/net/wireless/ath/ath9k/xmit.c
index b2c8207f7bc1..294b486bc3ed 100644
--- a/drivers/net/wireless/ath/ath9k/xmit.c
+++ b/drivers/net/wireless/ath/ath9k/xmit.c
@@ -1353,25 +1353,6 @@ static enum ath9k_pkt_type get_hw_packet_type(struct sk_buff *skb)
1353 return htype; 1353 return htype;
1354} 1354}
1355 1355
1356static bool is_pae(struct sk_buff *skb)
1357{
1358 struct ieee80211_hdr *hdr;
1359 __le16 fc;
1360
1361 hdr = (struct ieee80211_hdr *)skb->data;
1362 fc = hdr->frame_control;
1363
1364 if (ieee80211_is_data(fc)) {
1365 if (ieee80211_is_nullfunc(fc) ||
1366 /* Port Access Entity (IEEE 802.1X) */
1367 (skb->protocol == cpu_to_be16(ETH_P_PAE))) {
1368 return true;
1369 }
1370 }
1371
1372 return false;
1373}
1374
1375static int get_hw_crypto_keytype(struct sk_buff *skb) 1356static int get_hw_crypto_keytype(struct sk_buff *skb)
1376{ 1357{
1377 struct ieee80211_tx_info *tx_info = IEEE80211_SKB_CB(skb); 1358 struct ieee80211_tx_info *tx_info = IEEE80211_SKB_CB(skb);
@@ -1696,7 +1677,7 @@ static void ath_tx_start_dma(struct ath_softc *sc, struct ath_buf *bf,
1696 goto tx_done; 1677 goto tx_done;
1697 } 1678 }
1698 1679
1699 if ((tx_info->flags & IEEE80211_TX_CTL_AMPDU) && !is_pae(skb)) { 1680 if (tx_info->flags & IEEE80211_TX_CTL_AMPDU) {
1700 /* 1681 /*
1701 * Try aggregation if it's a unicast data frame 1682 * Try aggregation if it's a unicast data frame
1702 * and the destination is HT capable. 1683 * and the destination is HT capable.
diff --git a/drivers/net/wireless/iwlwifi/iwl-tx.c b/drivers/net/wireless/iwlwifi/iwl-tx.c
index 1ed5206721ec..8c12311dbb0a 100644
--- a/drivers/net/wireless/iwlwifi/iwl-tx.c
+++ b/drivers/net/wireless/iwlwifi/iwl-tx.c
@@ -124,7 +124,7 @@ void iwl_free_tfds_in_queue(struct iwl_priv *priv,
124 if (priv->stations[sta_id].tid[tid].tfds_in_queue >= freed) 124 if (priv->stations[sta_id].tid[tid].tfds_in_queue >= freed)
125 priv->stations[sta_id].tid[tid].tfds_in_queue -= freed; 125 priv->stations[sta_id].tid[tid].tfds_in_queue -= freed;
126 else { 126 else {
127 IWL_ERR(priv, "free more than tfds_in_queue (%u:%d)\n", 127 IWL_DEBUG_TX(priv, "free more than tfds_in_queue (%u:%d)\n",
128 priv->stations[sta_id].tid[tid].tfds_in_queue, 128 priv->stations[sta_id].tid[tid].tfds_in_queue,
129 freed); 129 freed);
130 priv->stations[sta_id].tid[tid].tfds_in_queue = 0; 130 priv->stations[sta_id].tid[tid].tfds_in_queue = 0;
diff --git a/drivers/net/wireless/wl12xx/wl1251_debugfs.c b/drivers/net/wireless/wl12xx/wl1251_debugfs.c
index 0ccba57fb9fb..05e4d68eb4cc 100644
--- a/drivers/net/wireless/wl12xx/wl1251_debugfs.c
+++ b/drivers/net/wireless/wl12xx/wl1251_debugfs.c
@@ -466,7 +466,8 @@ out:
466 466
467void wl1251_debugfs_reset(struct wl1251 *wl) 467void wl1251_debugfs_reset(struct wl1251 *wl)
468{ 468{
469 memset(wl->stats.fw_stats, 0, sizeof(*wl->stats.fw_stats)); 469 if (wl->stats.fw_stats != NULL)
470 memset(wl->stats.fw_stats, 0, sizeof(*wl->stats.fw_stats));
470 wl->stats.retry_count = 0; 471 wl->stats.retry_count = 0;
471 wl->stats.excessive_retries = 0; 472 wl->stats.excessive_retries = 0;
472} 473}
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index de296452c957..997668558e79 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -655,8 +655,8 @@ void pci_create_legacy_files(struct pci_bus *b)
655 goto legacy_io_err; 655 goto legacy_io_err;
656 656
657 /* Allocated above after the legacy_io struct */ 657 /* Allocated above after the legacy_io struct */
658 sysfs_bin_attr_init(b->legacy_mem);
659 b->legacy_mem = b->legacy_io + 1; 658 b->legacy_mem = b->legacy_io + 1;
659 sysfs_bin_attr_init(b->legacy_mem);
660 b->legacy_mem->attr.name = "legacy_mem"; 660 b->legacy_mem->attr.name = "legacy_mem";
661 b->legacy_mem->size = 1024*1024; 661 b->legacy_mem->size = 1024*1024;
662 b->legacy_mem->attr.mode = S_IRUSR | S_IWUSR; 662 b->legacy_mem->attr.mode = S_IRUSR | S_IWUSR;
diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c
index c7bbe30010f7..5af16c2bb540 100644
--- a/drivers/regulator/core.c
+++ b/drivers/regulator/core.c
@@ -1038,6 +1038,7 @@ static struct regulator *create_regulator(struct regulator_dev *rdev,
1038 goto overflow_err; 1038 goto overflow_err;
1039 1039
1040 regulator->dev = dev; 1040 regulator->dev = dev;
1041 sysfs_attr_init(&regulator->dev_attr.attr);
1041 regulator->dev_attr.attr.name = kstrdup(buf, GFP_KERNEL); 1042 regulator->dev_attr.attr.name = kstrdup(buf, GFP_KERNEL);
1042 if (regulator->dev_attr.attr.name == NULL) 1043 if (regulator->dev_attr.attr.name == NULL)
1043 goto attr_name_err; 1044 goto attr_name_err;
diff --git a/drivers/regulator/lp3971.c b/drivers/regulator/lp3971.c
index f5532ed79272..b20b3e1d821a 100644
--- a/drivers/regulator/lp3971.c
+++ b/drivers/regulator/lp3971.c
@@ -45,7 +45,7 @@ static int lp3971_set_bits(struct lp3971 *lp3971, u8 reg, u16 mask, u16 val);
45 LP3971_BUCK2 -> 4 45 LP3971_BUCK2 -> 4
46 LP3971_BUCK3 -> 6 46 LP3971_BUCK3 -> 6
47*/ 47*/
48#define BUCK_VOL_CHANGE_SHIFT(x) (((1 << x) & ~0x01) << 1) 48#define BUCK_VOL_CHANGE_SHIFT(x) (((!!x) << 2) | (x & ~0x01))
49#define BUCK_VOL_CHANGE_FLAG_GO 0x01 49#define BUCK_VOL_CHANGE_FLAG_GO 0x01
50#define BUCK_VOL_CHANGE_FLAG_TARGET 0x02 50#define BUCK_VOL_CHANGE_FLAG_TARGET 0x02
51#define BUCK_VOL_CHANGE_FLAG_MASK 0x03 51#define BUCK_VOL_CHANGE_FLAG_MASK 0x03
@@ -187,7 +187,8 @@ static int lp3971_ldo_set_voltage(struct regulator_dev *dev,
187 return -EINVAL; 187 return -EINVAL;
188 188
189 return lp3971_set_bits(lp3971, LP3971_LDO_VOL_CONTR_REG(ldo), 189 return lp3971_set_bits(lp3971, LP3971_LDO_VOL_CONTR_REG(ldo),
190 LDO_VOL_CONTR_MASK << LDO_VOL_CONTR_SHIFT(ldo), val); 190 LDO_VOL_CONTR_MASK << LDO_VOL_CONTR_SHIFT(ldo),
191 val << LDO_VOL_CONTR_SHIFT(ldo));
191} 192}
192 193
193static struct regulator_ops lp3971_ldo_ops = { 194static struct regulator_ops lp3971_ldo_ops = {
@@ -439,6 +440,10 @@ static int __devinit setup_regulators(struct lp3971 *lp3971,
439 lp3971->num_regulators = pdata->num_regulators; 440 lp3971->num_regulators = pdata->num_regulators;
440 lp3971->rdev = kcalloc(pdata->num_regulators, 441 lp3971->rdev = kcalloc(pdata->num_regulators,
441 sizeof(struct regulator_dev *), GFP_KERNEL); 442 sizeof(struct regulator_dev *), GFP_KERNEL);
443 if (!lp3971->rdev) {
444 err = -ENOMEM;
445 goto err_nomem;
446 }
442 447
443 /* Instantiate the regulators */ 448 /* Instantiate the regulators */
444 for (i = 0; i < pdata->num_regulators; i++) { 449 for (i = 0; i < pdata->num_regulators; i++) {
@@ -461,6 +466,7 @@ error:
461 regulator_unregister(lp3971->rdev[i]); 466 regulator_unregister(lp3971->rdev[i]);
462 kfree(lp3971->rdev); 467 kfree(lp3971->rdev);
463 lp3971->rdev = NULL; 468 lp3971->rdev = NULL;
469err_nomem:
464 return err; 470 return err;
465} 471}
466 472
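Two lp3971 fixes share one idiom. The LDO bug shifted only the mask, not the value, so writes landed in the wrong bits; a field update must shift both. And the reworked BUCK macro now maps indices 0,1,2 to shifts 0,4,6, where the old form `(((1 << x) & ~0x01) << 1)` gave 8 instead of 6 for BUCK3. A sketch verifying both:

    #include <stdio.h>
    #include <stdint.h>

    /* read-modify-write of a masked register field */
    #define FIELD_SET(reg, mask, shift, val) \
        (((reg) & ~((mask) << (shift))) | (((val) & (mask)) << (shift)))

    /* the corrected shift macro from the patch */
    #define BUCK_SHIFT(x) (((!!(x)) << 2) | ((x) & ~0x01))

    int main(void)
    {
        uint16_t reg = 0xffff;
        int x;

        /* write value 0x5 into a 4-bit field at bit 8 */
        printf("0x%04x\n", FIELD_SET(reg, 0xf, 8, 0x5));    /* 0xf5ff */

        for (x = 0; x < 3; x++)
            printf("BUCK%d shift = %d\n", x + 1, BUCK_SHIFT(x));  /* 0, 4, 6 */
        return 0;
    }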
diff --git a/drivers/regulator/max1586.c b/drivers/regulator/max1586.c
index a49fc952c9a9..c0b09e15edb6 100644
--- a/drivers/regulator/max1586.c
+++ b/drivers/regulator/max1586.c
@@ -243,8 +243,8 @@ static int __devexit max1586_pmic_remove(struct i2c_client *client)
 	for (i = 0; i <= MAX1586_V6; i++)
 		if (rdev[i])
 			regulator_unregister(rdev[i]);
-	kfree(rdev);
 	i2c_set_clientdata(client, NULL);
+	kfree(rdev);
 
 	return 0;
 }
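The reordering here (repeated below for max8660) follows the usual clear-then-free rule: detach the published pointer before freeing what it points to, so no window exists in which the client data references freed memory. A generic sketch of the idiom, with illustrative names rather than the driver's types:

	#include <stdlib.h>

	struct client { void *drvdata; };

	static void release(struct client *c)
	{
		void *data = c->drvdata;

		c->drvdata = NULL;	/* analogue of i2c_set_clientdata(client, NULL) */
		free(data);		/* analogue of kfree(rdev), safely done last */
	}
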
diff --git a/drivers/regulator/max8649.c b/drivers/regulator/max8649.c
index 3ebdf698c648..833aaedc7e64 100644
--- a/drivers/regulator/max8649.c
+++ b/drivers/regulator/max8649.c
@@ -356,6 +356,7 @@ static int __devinit max8649_regulator_probe(struct i2c_client *client,
 	dev_info(info->dev, "Max8649 regulator device is detected.\n");
 	return 0;
 out:
+	i2c_set_clientdata(client, NULL);
 	kfree(info);
 	return ret;
 }
@@ -367,9 +368,9 @@ static int __devexit max8649_regulator_remove(struct i2c_client *client)
 	if (info) {
 		if (info->regulator)
 			regulator_unregister(info->regulator);
+		i2c_set_clientdata(client, NULL);
 		kfree(info);
 	}
-	i2c_set_clientdata(client, NULL);
 
 	return 0;
 }
diff --git a/drivers/regulator/max8660.c b/drivers/regulator/max8660.c
index f12f1bb62138..47f90b2fc290 100644
--- a/drivers/regulator/max8660.c
+++ b/drivers/regulator/max8660.c
@@ -470,8 +470,8 @@ static int __devexit max8660_remove(struct i2c_client *client)
 	for (i = 0; i < MAX8660_V_END; i++)
 		if (rdev[i])
 			regulator_unregister(rdev[i]);
-	kfree(rdev);
 	i2c_set_clientdata(client, NULL);
+	kfree(rdev);
 
 	return 0;
 }
diff --git a/drivers/regulator/max8925-regulator.c b/drivers/regulator/max8925-regulator.c
index 67873f08ed40..b6218f11c957 100644
--- a/drivers/regulator/max8925-regulator.c
+++ b/drivers/regulator/max8925-regulator.c
@@ -230,7 +230,7 @@ static struct max8925_regulator_info max8925_regulator_info[] = {
 	MAX8925_LDO(20, 750, 3900, 50),
 };
 
-static inline struct max8925_regulator_info *find_regulator_info(int id)
+static struct max8925_regulator_info * __devinit find_regulator_info(int id)
 {
 	struct max8925_regulator_info *ri;
 	int i;
@@ -247,7 +247,7 @@ static int __devinit max8925_regulator_probe(struct platform_device *pdev)
 {
 	struct max8925_chip *chip = dev_get_drvdata(pdev->dev.parent);
 	struct max8925_platform_data *pdata = chip->dev->platform_data;
-	struct max8925_regulator_info *ri = NULL;
+	struct max8925_regulator_info *ri;
 	struct regulator_dev *rdev;
 
 	ri = find_regulator_info(pdev->id);
@@ -274,7 +274,9 @@ static int __devexit max8925_regulator_remove(struct platform_device *pdev)
 {
 	struct regulator_dev *rdev = platform_get_drvdata(pdev);
 
+	platform_set_drvdata(pdev, NULL);
 	regulator_unregister(rdev);
+
 	return 0;
 }
 
diff --git a/drivers/rtc/rtc-mc13783.c b/drivers/rtc/rtc-mc13783.c
index d60c81b7b693..1379c7faa448 100644
--- a/drivers/rtc/rtc-mc13783.c
+++ b/drivers/rtc/rtc-mc13783.c
@@ -319,35 +319,38 @@ static int __devinit mc13783_rtc_probe(struct platform_device *pdev)
 {
 	int ret;
 	struct mc13783_rtc *priv;
+	struct mc13783 *mc13783;
 	int rtcrst_pending;
 
 	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
 	if (!priv)
 		return -ENOMEM;
 
-	priv->mc13783 = dev_get_drvdata(pdev->dev.parent);
+	mc13783 = dev_get_drvdata(pdev->dev.parent);
+	priv->mc13783 = mc13783;
+
 	platform_set_drvdata(pdev, priv);
 
-	mc13783_lock(priv->mc13783);
+	mc13783_lock(mc13783);
 
-	ret = mc13783_irq_request(priv->mc13783, MC13783_IRQ_RTCRST,
+	ret = mc13783_irq_request(mc13783, MC13783_IRQ_RTCRST,
 			mc13783_rtc_reset_handler, DRIVER_NAME, priv);
 	if (ret)
 		goto err_reset_irq_request;
 
-	ret = mc13783_irq_status(priv->mc13783, MC13783_IRQ_RTCRST,
+	ret = mc13783_irq_status(mc13783, MC13783_IRQ_RTCRST,
 			NULL, &rtcrst_pending);
 	if (ret)
 		goto err_reset_irq_status;
 
 	priv->valid = !rtcrst_pending;
 
-	ret = mc13783_irq_request_nounmask(priv->mc13783, MC13783_IRQ_1HZ,
+	ret = mc13783_irq_request_nounmask(mc13783, MC13783_IRQ_1HZ,
 			mc13783_rtc_update_handler, DRIVER_NAME, priv);
 	if (ret)
 		goto err_update_irq_request;
 
-	ret = mc13783_irq_request_nounmask(priv->mc13783, MC13783_IRQ_TODA,
+	ret = mc13783_irq_request_nounmask(mc13783, MC13783_IRQ_TODA,
 			mc13783_rtc_alarm_handler, DRIVER_NAME, priv);
 	if (ret)
 		goto err_alarm_irq_request;
@@ -357,22 +360,22 @@ static int __devinit mc13783_rtc_probe(struct platform_device *pdev)
 	if (IS_ERR(priv->rtc)) {
 		ret = PTR_ERR(priv->rtc);
 
-		mc13783_irq_free(priv->mc13783, MC13783_IRQ_TODA, priv);
+		mc13783_irq_free(mc13783, MC13783_IRQ_TODA, priv);
 err_alarm_irq_request:
 
-		mc13783_irq_free(priv->mc13783, MC13783_IRQ_1HZ, priv);
+		mc13783_irq_free(mc13783, MC13783_IRQ_1HZ, priv);
 err_update_irq_request:
 
 err_reset_irq_status:
 
-		mc13783_irq_free(priv->mc13783, MC13783_IRQ_RTCRST, priv);
+		mc13783_irq_free(mc13783, MC13783_IRQ_RTCRST, priv);
 err_reset_irq_request:
 
 		platform_set_drvdata(pdev, NULL);
 		kfree(priv);
 	}
 
-	mc13783_unlock(priv->mc13783);
+	mc13783_unlock(mc13783);
 
 	return ret;
 }
diff --git a/drivers/s390/block/dasd_3990_erp.c b/drivers/s390/block/dasd_3990_erp.c
index 51224f76b980..b3736b8aad39 100644
--- a/drivers/s390/block/dasd_3990_erp.c
+++ b/drivers/s390/block/dasd_3990_erp.c
@@ -2287,7 +2287,8 @@ static struct dasd_ccw_req *dasd_3990_erp_add_erp(struct dasd_ccw_req *cqr)
 
 	if (cqr->cpmode == 1) {
 		cplength = 0;
-		datasize = sizeof(struct tcw) + sizeof(struct tsb);
+		/* TCW needs to be 64 byte aligned, so leave enough room */
+		datasize = 64 + sizeof(struct tcw) + sizeof(struct tsb);
 	} else {
 		cplength = 2;
 		datasize = 0;
@@ -2316,8 +2317,8 @@ static struct dasd_ccw_req *dasd_3990_erp_add_erp(struct dasd_ccw_req *cqr)
 	if (cqr->cpmode == 1) {
 		/* make a shallow copy of the original tcw but set new tsb */
 		erp->cpmode = 1;
-		erp->cpaddr = erp->data;
-		tcw = erp->data;
+		erp->cpaddr = PTR_ALIGN(erp->data, 64);
+		tcw = erp->cpaddr;
 		tsb = (struct tsb *) &tcw[1];
 		*tcw = *((struct tcw *)cqr->cpaddr);
 		tcw->tsb = (long)tsb;
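The pairing in this hunk (allocate 64 extra bytes, then PTR_ALIGN the pointer) is the standard way to guarantee alignment inside a buffer whose own alignment is unknown. A minimal userspace rendering of the idiom, with PTR_ALIGN written out by hand rather than taken from the kernel headers:

	#include <stdint.h>
	#include <stdlib.h>

	/* round p up to the next multiple of a (a must be a power of two) */
	#define PTR_ALIGN(p, a) \
		((void *)(((uintptr_t)(p) + ((a) - 1)) & ~(uintptr_t)((a) - 1)))

	static void *alloc_tcw_area(size_t payload)
	{
		void *raw = malloc(64 + payload);	/* slack for worst-case misalignment */

		return raw ? PTR_ALIGN(raw, 64) : NULL;	/* aligned, still inside raw */
	}
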
diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c
index 01f4e7a34aa8..0cb233116855 100644
--- a/drivers/s390/block/dasd_eckd.c
+++ b/drivers/s390/block/dasd_eckd.c
@@ -3155,11 +3155,11 @@ static void dasd_eckd_dump_sense_tcw(struct dasd_device *device,
 
 	tsb = NULL;
 	sense = NULL;
-	if (irb->scsw.tm.tcw)
+	if (irb->scsw.tm.tcw && (irb->scsw.tm.fcxs == 0x01))
 		tsb = tcw_get_tsb(
 			(struct tcw *)(unsigned long)irb->scsw.tm.tcw);
 
-	if (tsb && (irb->scsw.tm.fcxs == 0x01)) {
+	if (tsb) {
 		len += sprintf(page + len, KERN_ERR PRINTK_HEADER
 			       " tsb->length %d\n", tsb->length);
 		len += sprintf(page + len, KERN_ERR PRINTK_HEADER
diff --git a/drivers/s390/char/sclp_async.c b/drivers/s390/char/sclp_async.c
index 740fe405c395..f449c696e503 100644
--- a/drivers/s390/char/sclp_async.c
+++ b/drivers/s390/char/sclp_async.c
@@ -84,6 +84,7 @@ static int proc_handler_callhome(struct ctl_table *ctl, int write,
 	rc = copy_from_user(buf, buffer, sizeof(buf));
 	if (rc != 0)
 		return -EFAULT;
+	buf[len - 1] = '\0';
 	if (strict_strtoul(buf, 0, &val) != 0)
 		return -EINVAL;
 	if (val != 0 && val != 1)
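The added buf[len - 1] = '\0' matters because copy_from_user() moves raw bytes while strict_strtoul() expects a NUL-terminated string. A small sketch of the failure mode, with memcpy standing in for the user copy and plain strtol for the kernel helper:

	#include <stdlib.h>
	#include <string.h>

	static long parse_user_flag(const char *user_bytes, size_t len)
	{
		char buf[8];

		if (len == 0 || len > sizeof(buf))
			return -1;
		memcpy(buf, user_bytes, len);	/* copy_from_user() analogue */
		buf[len - 1] = '\0';		/* without this, strtol may read past the copy */
		return strtol(buf, NULL, 0);
	}
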
diff --git a/drivers/s390/char/sclp_cmd.c b/drivers/s390/char/sclp_cmd.c
index fc7ae05ce48a..4b60ede07f0e 100644
--- a/drivers/s390/char/sclp_cmd.c
+++ b/drivers/s390/char/sclp_cmd.c
@@ -308,6 +308,13 @@ struct assign_storage_sccb {
 	u16 rn;
 } __packed;
 
+int arch_get_memory_phys_device(unsigned long start_pfn)
+{
+	if (!rzm)
+		return 0;
+	return PFN_PHYS(start_pfn) >> ilog2(rzm);
+}
+
 static unsigned long long rn2addr(u16 rn)
 {
 	return (unsigned long long) (rn - 1) * rzm;
@@ -704,13 +711,6 @@ int sclp_chp_deconfigure(struct chp_id chpid)
 	return do_chp_configure(SCLP_CMDW_DECONFIGURE_CHPATH | chpid.id << 8);
 }
 
-int arch_get_memory_phys_device(unsigned long start_pfn)
-{
-	if (!rzm)
-		return 0;
-	return PFN_PHYS(start_pfn) / rzm;
-}
-
 struct chp_info_sccb {
 	struct sccb_header header;
 	u8 recognized[SCLP_CHP_INFO_MASK_SIZE];
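Besides moving the helper, the hunk swaps the division for a shift. The two agree exactly when rzm is a power of two, which is the property the shift relies on here. A standalone check under that assumption, with a hand-rolled stand-in for the kernel's ilog2():

	#include <assert.h>

	/* stand-in for ilog2() on power-of-two inputs */
	static unsigned int ilog2_pow2(unsigned long v)
	{
		unsigned int r = 0;

		while (v >>= 1)
			r++;
		return r;
	}

	int main(void)
	{
		unsigned long rzm = 16UL << 20;	/* hypothetical 16 MiB increment size */
		unsigned long addr = 3UL << 30;	/* hypothetical address */

		assert(addr / rzm == addr >> ilog2_pow2(rzm));
		return 0;
	}
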
diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c
index 3438658b66b7..3166d85914f2 100644
--- a/drivers/s390/char/zcore.c
+++ b/drivers/s390/char/zcore.c
@@ -141,33 +141,6 @@ static int memcpy_hsa_kernel(void *dest, unsigned long src, size_t count)
 	return memcpy_hsa(dest, src, count, TO_KERNEL);
 }
 
-static int memcpy_real(void *dest, unsigned long src, size_t count)
-{
-	unsigned long flags;
-	int rc = -EFAULT;
-	register unsigned long _dest asm("2") = (unsigned long) dest;
-	register unsigned long _len1 asm("3") = (unsigned long) count;
-	register unsigned long _src asm("4") = src;
-	register unsigned long _len2 asm("5") = (unsigned long) count;
-
-	if (count == 0)
-		return 0;
-	flags = __raw_local_irq_stnsm(0xf8UL); /* switch to real mode */
-	asm volatile (
-		"0: mvcle %1,%2,0x0\n"
-		"1: jo 0b\n"
-		"   lhi %0,0x0\n"
-		"2:\n"
-		EX_TABLE(1b,2b)
-		: "+d" (rc), "+d" (_dest), "+d" (_src), "+d" (_len1),
-		  "+d" (_len2), "=m" (*((long*)dest))
-		: "m" (*((long*)src))
-		: "cc", "memory");
-	__raw_local_irq_ssm(flags);
-
-	return rc;
-}
-
 static int memcpy_real_user(void __user *dest, unsigned long src, size_t count)
 {
 	static char buf[4096];
@@ -175,7 +148,7 @@ static int memcpy_real_user(void __user *dest, unsigned long src, size_t count)
 
 	while (offs < count) {
 		size = min(sizeof(buf), count - offs);
-		if (memcpy_real(buf, src + offs, size))
+		if (memcpy_real(buf, (void *) src + offs, size))
 			return -EFAULT;
 		if (copy_to_user(dest + offs, buf, size))
 			return -EFAULT;
@@ -663,7 +636,7 @@ static int __init zcore_reipl_init(void)
 	if (ipib_info.ipib < ZFCPDUMP_HSA_SIZE)
 		rc = memcpy_hsa_kernel(ipl_block, ipib_info.ipib, PAGE_SIZE);
 	else
-		rc = memcpy_real(ipl_block, ipib_info.ipib, PAGE_SIZE);
+		rc = memcpy_real(ipl_block, (void *) ipib_info.ipib, PAGE_SIZE);
 	if (rc) {
 		free_page((unsigned long) ipl_block);
 		return rc;
diff --git a/drivers/serial/cpm_uart/cpm_uart_cpm2.c b/drivers/serial/cpm_uart/cpm_uart_cpm2.c
index a9802e76b5fa..722eac18f382 100644
--- a/drivers/serial/cpm_uart/cpm_uart_cpm2.c
+++ b/drivers/serial/cpm_uart/cpm_uart_cpm2.c
@@ -61,7 +61,7 @@ void __iomem *cpm_uart_map_pram(struct uart_cpm_port *port,
 	void __iomem *pram;
 	unsigned long offset;
 	struct resource res;
-	unsigned long len;
+	resource_size_t len;
 
 	/* Don't remap parameter RAM if it has already been initialized
 	 * during console setup.
@@ -74,7 +74,7 @@ void __iomem *cpm_uart_map_pram(struct uart_cpm_port *port,
 	if (of_address_to_resource(np, 1, &res))
 		return NULL;
 
-	len = 1 + res.end - res.start;
+	len = resource_size(&res);
 	pram = ioremap(res.start, len);
 	if (!pram)
 		return NULL;
diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index 980f39449ee5..291bc08e2e84 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -50,7 +50,6 @@
 #include <linux/list.h>
 #include <linux/dmaengine.h>
 #include <linux/scatterlist.h>
-#include <linux/timer.h>
 
 #ifdef CONFIG_SUPERH
 #include <asm/sh_bios.h>
@@ -83,16 +82,16 @@ struct sci_port {
 
 	/* Interface clock */
 	struct clk *iclk;
-	/* Data clock */
-	struct clk *dclk;
+	/* Function clock */
+	struct clk *fclk;
 
 	struct list_head node;
 	struct dma_chan *chan_tx;
 	struct dma_chan *chan_rx;
 #ifdef CONFIG_SERIAL_SH_SCI_DMA
 	struct device *dma_dev;
-	enum sh_dmae_slave_chan_id slave_tx;
-	enum sh_dmae_slave_chan_id slave_rx;
+	unsigned int slave_tx;
+	unsigned int slave_rx;
 	struct dma_async_tx_descriptor *desc_tx;
 	struct dma_async_tx_descriptor *desc_rx[2];
 	dma_cookie_t cookie_tx;
@@ -107,6 +106,7 @@ struct sci_port {
 	struct work_struct work_tx;
 	struct work_struct work_rx;
 	struct timer_list rx_timer;
+	unsigned int rx_timeout;
 #endif
 };
 
@@ -674,22 +674,22 @@ static irqreturn_t sci_rx_interrupt(int irq, void *ptr)
 	struct sci_port *s = to_sci_port(port);
 
 	if (s->chan_rx) {
-		unsigned long tout;
 		u16 scr = sci_in(port, SCSCR);
 		u16 ssr = sci_in(port, SCxSR);
 
 		/* Disable future Rx interrupts */
-		sci_out(port, SCSCR, scr & ~SCI_CTRL_FLAGS_RIE);
+		if (port->type == PORT_SCIFA) {
+			disable_irq_nosync(irq);
+			scr |= 0x4000;
+		} else {
+			scr &= ~SCI_CTRL_FLAGS_RIE;
+		}
+		sci_out(port, SCSCR, scr);
 		/* Clear current interrupt */
 		sci_out(port, SCxSR, ssr & ~(1 | SCxSR_RDxF(port)));
-		/* Calculate delay for 1.5 DMA buffers */
-		tout = (port->timeout - HZ / 50) * s->buf_len_rx * 3 /
-			port->fifosize / 2;
-		dev_dbg(port->dev, "Rx IRQ: setup timeout in %lu ms\n",
-			tout * 1000 / HZ);
-		if (tout < 2)
-			tout = 2;
-		mod_timer(&s->rx_timer, jiffies + tout);
+		dev_dbg(port->dev, "Rx IRQ %lu: setup t-out in %u jiffies\n",
+			jiffies, s->rx_timeout);
+		mod_timer(&s->rx_timer, jiffies + s->rx_timeout);
 
 		return IRQ_HANDLED;
 	}
@@ -780,10 +780,6 @@ static irqreturn_t sci_mpxed_interrupt(int irq, void *ptr)
 	if ((ssr_status & SCxSR_BRK(port)) && err_enabled)
 		ret = sci_br_interrupt(irq, ptr);
 
-	WARN_ONCE(ret == IRQ_NONE,
-		  "%s: %d IRQ %d, status %x, control %x\n", __func__,
-		  irq, port->line, ssr_status, scr_status);
-
 	return ret;
 }
 
@@ -803,7 +799,7 @@ static int sci_notifier(struct notifier_block *self,
 	    (phase == CPUFREQ_RESUMECHANGE)) {
 		spin_lock_irqsave(&priv->lock, flags);
 		list_for_each_entry(sci_port, &priv->ports, node)
-			sci_port->port.uartclk = clk_get_rate(sci_port->dclk);
+			sci_port->port.uartclk = clk_get_rate(sci_port->iclk);
 		spin_unlock_irqrestore(&priv->lock, flags);
 	}
 
@@ -814,21 +810,17 @@ static void sci_clk_enable(struct uart_port *port)
 {
 	struct sci_port *sci_port = to_sci_port(port);
 
-	clk_enable(sci_port->dclk);
-	sci_port->port.uartclk = clk_get_rate(sci_port->dclk);
-
-	if (sci_port->iclk)
-		clk_enable(sci_port->iclk);
+	clk_enable(sci_port->iclk);
+	sci_port->port.uartclk = clk_get_rate(sci_port->iclk);
+	clk_enable(sci_port->fclk);
 }
 
 static void sci_clk_disable(struct uart_port *port)
 {
 	struct sci_port *sci_port = to_sci_port(port);
 
-	if (sci_port->iclk)
-		clk_disable(sci_port->iclk);
-
-	clk_disable(sci_port->dclk);
+	clk_disable(sci_port->fclk);
+	clk_disable(sci_port->iclk);
 }
 
 static int sci_request_irq(struct sci_port *port)
@@ -917,22 +909,26 @@ static void sci_dma_tx_complete(void *arg)
 
 	spin_lock_irqsave(&port->lock, flags);
 
-	xmit->tail += s->sg_tx.length;
+	xmit->tail += sg_dma_len(&s->sg_tx);
 	xmit->tail &= UART_XMIT_SIZE - 1;
 
-	port->icount.tx += s->sg_tx.length;
+	port->icount.tx += sg_dma_len(&s->sg_tx);
 
 	async_tx_ack(s->desc_tx);
 	s->cookie_tx = -EINVAL;
 	s->desc_tx = NULL;
 
-	spin_unlock_irqrestore(&port->lock, flags);
-
 	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
 		uart_write_wakeup(port);
 
-	if (uart_circ_chars_pending(xmit))
+	if (!uart_circ_empty(xmit)) {
 		schedule_work(&s->work_tx);
+	} else if (port->type == PORT_SCIFA) {
+		u16 ctrl = sci_in(port, SCSCR);
+		sci_out(port, SCSCR, ctrl & ~SCI_CTRL_FLAGS_TIE);
+	}
+
+	spin_unlock_irqrestore(&port->lock, flags);
 }
 
 /* Locking: called with port lock held */
@@ -976,13 +972,13 @@ static void sci_dma_rx_complete(void *arg)
 	unsigned long flags;
 	int count;
 
-	dev_dbg(port->dev, "%s(%d)\n", __func__, port->line);
+	dev_dbg(port->dev, "%s(%d) active #%d\n", __func__, port->line, s->active_rx);
 
 	spin_lock_irqsave(&port->lock, flags);
 
 	count = sci_dma_rx_push(s, tty, s->buf_len_rx);
 
-	mod_timer(&s->rx_timer, jiffies + msecs_to_jiffies(5));
+	mod_timer(&s->rx_timer, jiffies + s->rx_timeout);
 
 	spin_unlock_irqrestore(&port->lock, flags);
 
@@ -1054,6 +1050,8 @@ static void sci_submit_rx(struct sci_port *s)
 			sci_rx_dma_release(s, true);
 			return;
 		}
+		dev_dbg(s->port.dev, "%s(): cookie %d to #%d\n", __func__,
+			s->cookie_rx[i], i);
 	}
 
 	s->active_rx = s->cookie_rx[0];
@@ -1111,10 +1109,10 @@ static void work_fn_rx(struct work_struct *work)
 		return;
 	}
 
-	dev_dbg(port->dev, "%s: cookie %d #%d\n", __func__,
-		s->cookie_rx[new], new);
-
 	s->active_rx = s->cookie_rx[!new];
+
+	dev_dbg(port->dev, "%s: cookie %d #%d, new active #%d\n", __func__,
+		s->cookie_rx[new], new, s->active_rx);
 }
 
 static void work_fn_tx(struct work_struct *work)
@@ -1135,14 +1133,13 @@ static void work_fn_tx(struct work_struct *work)
 	 */
 	spin_lock_irq(&port->lock);
 	sg->offset = xmit->tail & (UART_XMIT_SIZE - 1);
-	sg->dma_address = (sg_dma_address(sg) & ~(UART_XMIT_SIZE - 1)) +
+	sg_dma_address(sg) = (sg_dma_address(sg) & ~(UART_XMIT_SIZE - 1)) +
 		sg->offset;
-	sg->length = min((int)CIRC_CNT(xmit->head, xmit->tail, UART_XMIT_SIZE),
+	sg_dma_len(sg) = min((int)CIRC_CNT(xmit->head, xmit->tail, UART_XMIT_SIZE),
 		CIRC_CNT_TO_END(xmit->head, xmit->tail, UART_XMIT_SIZE));
-	sg->dma_length = sg->length;
 	spin_unlock_irq(&port->lock);
 
-	BUG_ON(!sg->length);
+	BUG_ON(!sg_dma_len(sg));
 
 	desc = chan->device->device_prep_slave_sg(chan,
 			sg, s->sg_len_tx, DMA_TO_DEVICE,
@@ -1177,23 +1174,28 @@ static void work_fn_tx(struct work_struct *work)
 
 static void sci_start_tx(struct uart_port *port)
 {
+	struct sci_port *s = to_sci_port(port);
 	unsigned short ctrl;
 
 #ifdef CONFIG_SERIAL_SH_SCI_DMA
-	struct sci_port *s = to_sci_port(port);
-
-	if (s->chan_tx) {
-		if (!uart_circ_empty(&s->port.state->xmit) && s->cookie_tx < 0)
-			schedule_work(&s->work_tx);
-
-		return;
+	if (port->type == PORT_SCIFA) {
+		u16 new, scr = sci_in(port, SCSCR);
+		if (s->chan_tx)
+			new = scr | 0x8000;
+		else
+			new = scr & ~0x8000;
+		if (new != scr)
+			sci_out(port, SCSCR, new);
 	}
+	if (s->chan_tx && !uart_circ_empty(&s->port.state->xmit) &&
+	    s->cookie_tx < 0)
+		schedule_work(&s->work_tx);
 #endif
-
+	if (!s->chan_tx || port->type == PORT_SCIFA) {
 	/* Set TIE (Transmit Interrupt Enable) bit in SCSCR */
 	ctrl = sci_in(port, SCSCR);
-	ctrl |= SCI_CTRL_FLAGS_TIE;
-	sci_out(port, SCSCR, ctrl);
+		sci_out(port, SCSCR, ctrl | SCI_CTRL_FLAGS_TIE);
+	}
 }
 
@@ -1202,6 +1204,8 @@ static void sci_stop_tx(struct uart_port *port)
 
 	/* Clear TIE (Transmit Interrupt Enable) bit in SCSCR */
 	ctrl = sci_in(port, SCSCR);
+	if (port->type == PORT_SCIFA)
+		ctrl &= ~0x8000;
 	ctrl &= ~SCI_CTRL_FLAGS_TIE;
 	sci_out(port, SCSCR, ctrl);
 }
@@ -1212,6 +1216,8 @@ static void sci_start_rx(struct uart_port *port)
 
 	/* Set RIE (Receive Interrupt Enable) bit in SCSCR */
 	ctrl |= sci_in(port, SCSCR);
+	if (port->type == PORT_SCIFA)
+		ctrl &= ~0x4000;
 	sci_out(port, SCSCR, ctrl);
 }
 
@@ -1221,6 +1227,8 @@ static void sci_stop_rx(struct uart_port *port)
 
 	/* Clear RIE (Receive Interrupt Enable) bit in SCSCR */
 	ctrl = sci_in(port, SCSCR);
+	if (port->type == PORT_SCIFA)
+		ctrl &= ~0x4000;
 	ctrl &= ~(SCI_CTRL_FLAGS_RIE | SCI_CTRL_FLAGS_REIE);
 	sci_out(port, SCSCR, ctrl);
 }
@@ -1255,8 +1263,12 @@ static void rx_timer_fn(unsigned long arg)
 {
 	struct sci_port *s = (struct sci_port *)arg;
 	struct uart_port *port = &s->port;
-
 	u16 scr = sci_in(port, SCSCR);
+
+	if (port->type == PORT_SCIFA) {
+		scr &= ~0x4000;
+		enable_irq(s->irqs[1]);
+	}
 	sci_out(port, SCSCR, scr | SCI_CTRL_FLAGS_RIE);
 	dev_dbg(port->dev, "DMA Rx timed out\n");
 	schedule_work(&s->work_rx);
@@ -1343,8 +1355,7 @@ static void sci_request_dma(struct uart_port *port)
 			sg_init_table(sg, 1);
 			sg_set_page(sg, virt_to_page(buf[i]), s->buf_len_rx,
 				    (int)buf[i] & ~PAGE_MASK);
-			sg->dma_address = dma[i];
-			sg->dma_length = sg->length;
+			sg_dma_address(sg) = dma[i];
 		}
 
 		INIT_WORK(&s->work_rx, work_fn_rx);
@@ -1407,8 +1418,12 @@ static void sci_shutdown(struct uart_port *port)
 static void sci_set_termios(struct uart_port *port, struct ktermios *termios,
 			    struct ktermios *old)
 {
+#ifdef CONFIG_SERIAL_SH_SCI_DMA
+	struct sci_port *s = to_sci_port(port);
+#endif
 	unsigned int status, baud, smr_val, max_baud;
 	int t = -1;
+	u16 scfcr = 0;
 
 	/*
 	 * earlyprintk comes here early on with port->uartclk set to zero.
@@ -1431,7 +1446,7 @@ static void sci_set_termios(struct uart_port *port, struct ktermios *termios,
 	sci_out(port, SCSCR, 0x00);	/* TE=0, RE=0, CKE1=0 */
 
 	if (port->type != PORT_SCI)
-		sci_out(port, SCFCR, SCFCR_RFRST | SCFCR_TFRST);
+		sci_out(port, SCFCR, scfcr | SCFCR_RFRST | SCFCR_TFRST);
 
 	smr_val = sci_in(port, SCSMR) & 3;
 	if ((termios->c_cflag & CSIZE) == CS7)
@@ -1462,10 +1477,32 @@ static void sci_set_termios(struct uart_port *port, struct ktermios *termios,
 	}
 
 	sci_init_pins(port, termios->c_cflag);
-	sci_out(port, SCFCR, (termios->c_cflag & CRTSCTS) ? SCFCR_MCE : 0);
+	sci_out(port, SCFCR, scfcr | ((termios->c_cflag & CRTSCTS) ? SCFCR_MCE : 0));
 
 	sci_out(port, SCSCR, SCSCR_INIT(port));
 
+#ifdef CONFIG_SERIAL_SH_SCI_DMA
+	/*
+	 * Calculate delay for 1.5 DMA buffers: see
+	 * drivers/serial/serial_core.c::uart_update_timeout(). With 10 bits
+	 * (CS8), 250Hz, 115200 baud and 64 bytes FIFO, the above function
+	 * calculates 1 jiffie for the data plus 5 jiffies for the "slop(e)."
+	 * Then below we calculate 3 jiffies (12ms) for 1.5 DMA buffers (3 FIFO
+	 * sizes), but it has been found out experimentally, that this is not
+	 * enough: the driver too often needlessly runs on a DMA timeout. 20ms
+	 * as a minimum seem to work perfectly.
+	 */
+	if (s->chan_rx) {
+		s->rx_timeout = (port->timeout - HZ / 50) * s->buf_len_rx * 3 /
+			port->fifosize / 2;
+		dev_dbg(port->dev,
+			"DMA Rx t-out %ums, tty t-out %u jiffies\n",
+			s->rx_timeout * 1000 / HZ, port->timeout);
+		if (s->rx_timeout < msecs_to_jiffies(20))
+			s->rx_timeout = msecs_to_jiffies(20);
+	}
+#endif
+
 	if ((termios->c_cflag & CREAD) != 0)
 		sci_start_rx(port);
 }
@@ -1557,10 +1594,10 @@ static struct uart_ops sci_uart_ops = {
 #endif
 };
 
-static void __devinit sci_init_single(struct platform_device *dev,
+static int __devinit sci_init_single(struct platform_device *dev,
 				      struct sci_port *sci_port,
 				      unsigned int index,
 				      struct plat_sci_port *p)
 {
 	struct uart_port *port = &sci_port->port;
 
@@ -1581,8 +1618,23 @@ static void __devinit sci_init_single(struct platform_device *dev,
 	}
 
 	if (dev) {
-		sci_port->iclk = p->clk ? clk_get(&dev->dev, p->clk) : NULL;
-		sci_port->dclk = clk_get(&dev->dev, "peripheral_clk");
+		sci_port->iclk = clk_get(&dev->dev, "sci_ick");
+		if (IS_ERR(sci_port->iclk)) {
+			sci_port->iclk = clk_get(&dev->dev, "peripheral_clk");
+			if (IS_ERR(sci_port->iclk)) {
+				dev_err(&dev->dev, "can't get iclk\n");
+				return PTR_ERR(sci_port->iclk);
+			}
+		}
+
+		/*
+		 * The function clock is optional, ignore it if we can't
+		 * find it.
+		 */
+		sci_port->fclk = clk_get(&dev->dev, "sci_fck");
+		if (IS_ERR(sci_port->fclk))
+			sci_port->fclk = NULL;
+
 		sci_port->enable = sci_clk_enable;
 		sci_port->disable = sci_clk_disable;
 		port->dev = &dev->dev;
@@ -1609,6 +1661,7 @@ static void __devinit sci_init_single(struct platform_device *dev,
 #endif
 
 	memcpy(&sci_port->irqs, &p->irqs, sizeof(p->irqs));
+	return 0;
 }
 
 #ifdef CONFIG_SERIAL_SH_SCI_CONSOLE
@@ -1758,8 +1811,11 @@ static int sci_remove(struct platform_device *dev)
 	cpufreq_unregister_notifier(&priv->clk_nb, CPUFREQ_TRANSITION_NOTIFIER);
 
 	spin_lock_irqsave(&priv->lock, flags);
-	list_for_each_entry(p, &priv->ports, node)
+	list_for_each_entry(p, &priv->ports, node) {
 		uart_remove_one_port(&sci_uart_driver, &p->port);
+		clk_put(p->iclk);
+		clk_put(p->fclk);
+	}
 	spin_unlock_irqrestore(&priv->lock, flags);
 
 	kfree(priv);
@@ -1785,7 +1841,9 @@ static int __devinit sci_probe_single(struct platform_device *dev,
 		return 0;
 	}
 
-	sci_init_single(dev, sciport, index, p);
+	ret = sci_init_single(dev, sciport, index, p);
+	if (ret)
+		return ret;
 
 	ret = uart_add_one_port(&sci_uart_driver, &sciport->port);
 	if (ret)
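The long comment added in sci_set_termios() is easiest to follow with its numbers plugged in. Under the assumptions it states (HZ = 250, 115200 baud, CS8, 64-byte FIFO), and taking a DMA buffer of two FIFO sizes for buf_len_rx (an assumption here, inferred from "1.5 DMA buffers (3 FIFO sizes)"), the formula lands on the 3 jiffies / 12 ms the comment mentions, which the driver then raises to the 20 ms floor:

	#include <stdio.h>

	int main(void)
	{
		unsigned int hz = 250, fifosize = 64, buf_len_rx = 2 * 64;
		unsigned int timeout = 6;	/* 1 jiffy of data + 5 jiffies of slop */
		unsigned int rx_timeout = (timeout - hz / 50) * buf_len_rx * 3 /
					  fifosize / 2;

		printf("%u jiffies = %u ms\n", rx_timeout, rx_timeout * 1000 / hz);
		return 0;	/* prints: 3 jiffies = 12 ms */
	}
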
diff --git a/drivers/serial/sh-sci.h b/drivers/serial/sh-sci.h
index fad67d33b0bd..f70c49f915fa 100644
--- a/drivers/serial/sh-sci.h
+++ b/drivers/serial/sh-sci.h
@@ -31,7 +31,9 @@
 # define SCSCR_INIT(port) (port->mapbase == SCIF2) ? 0xF3 : 0xF0
 #elif defined(CONFIG_CPU_SUBTYPE_SH7720) || \
       defined(CONFIG_CPU_SUBTYPE_SH7721) || \
-      defined(CONFIG_ARCH_SHMOBILE)
+      defined(CONFIG_ARCH_SH7367) || \
+      defined(CONFIG_ARCH_SH7377) || \
+      defined(CONFIG_ARCH_SH7372)
 # define SCSCR_INIT(port) 0x0030 /* TIE=0,RIE=0,TE=1,RE=1 */
 # define PORT_PTCR 0xA405011EUL
 # define PORT_PVCR 0xA4050122UL
@@ -94,7 +96,9 @@
 # define SCSCR_INIT(port) 0x0038 /* TIE=0,RIE=0,TE=1,RE=1,REIE=1 */
 #elif defined(CONFIG_CPU_SUBTYPE_SH7724)
 # define SCIF_ORER 0x0001  /* overrun error bit */
-# define SCSCR_INIT(port) 0x0038 /* TIE=0,RIE=0,TE=1,RE=1,REIE=1 */
+# define SCSCR_INIT(port) ((port)->type == PORT_SCIFA ? \
+	0x30 /* TIE=0,RIE=0,TE=1,RE=1 */ : \
+	0x38 /* TIE=0,RIE=0,TE=1,RE=1,REIE=1 */ )
 #elif defined(CONFIG_CPU_SUBTYPE_SH4_202)
 # define SCSPTR2 0xffe80020 /* 16 bit SCIF */
 # define SCIF_ORER 0x0001   /* overrun error bit */
@@ -197,6 +201,8 @@
       defined(CONFIG_CPU_SUBTYPE_SH7786) || \
       defined(CONFIG_CPU_SUBTYPE_SHX3)
 #define SCI_CTRL_FLAGS_REIE 0x08 /* 7750 SCIF */
+#elif defined(CONFIG_CPU_SUBTYPE_SH7724)
+#define SCI_CTRL_FLAGS_REIE ((port)->type == PORT_SCIFA ? 0 : 8)
 #else
 #define SCI_CTRL_FLAGS_REIE 0
 #endif
@@ -230,7 +236,9 @@
 #if defined(CONFIG_CPU_SUBTYPE_SH7705) || \
     defined(CONFIG_CPU_SUBTYPE_SH7720) || \
     defined(CONFIG_CPU_SUBTYPE_SH7721) || \
-    defined(CONFIG_ARCH_SHMOBILE)
+    defined(CONFIG_ARCH_SH7367) || \
+    defined(CONFIG_ARCH_SH7377) || \
+    defined(CONFIG_ARCH_SH7372)
 # define SCIF_ORER    0x0200
 # define SCIF_ERRORS ( SCIF_PER | SCIF_FER | SCIF_ER | SCIF_BRK | SCIF_ORER)
 # define SCIF_RFDC_MASK 0x007f
@@ -264,7 +272,9 @@
 #if defined(CONFIG_CPU_SUBTYPE_SH7705) || \
     defined(CONFIG_CPU_SUBTYPE_SH7720) || \
     defined(CONFIG_CPU_SUBTYPE_SH7721) || \
-    defined(CONFIG_ARCH_SHMOBILE)
+    defined(CONFIG_ARCH_SH7367) || \
+    defined(CONFIG_ARCH_SH7377) || \
+    defined(CONFIG_ARCH_SH7372)
 # define SCxSR_RDxF_CLEAR(port)	 (sci_in(port, SCxSR) & 0xfffc)
 # define SCxSR_ERROR_CLEAR(port) (sci_in(port, SCxSR) & 0xfd73)
 # define SCxSR_TDxE_CLEAR(port)	 (sci_in(port, SCxSR) & 0xffdf)
@@ -359,7 +369,10 @@
 		SCI_OUT(sci_size, sci_offset, value); \
 	}
 
-#if defined(CONFIG_CPU_SH3) || defined(CONFIG_ARCH_SHMOBILE)
+#if defined(CONFIG_CPU_SH3) || \
+    defined(CONFIG_ARCH_SH7367) || \
+    defined(CONFIG_ARCH_SH7377) || \
+    defined(CONFIG_ARCH_SH7372)
 #if defined(CONFIG_CPU_SUBTYPE_SH7710) || defined(CONFIG_CPU_SUBTYPE_SH7712)
 #define SCIx_FNS(name, sh3_sci_offset, sh3_sci_size, sh4_sci_offset, sh4_sci_size, \
 		 sh3_scif_offset, sh3_scif_size, sh4_scif_offset, sh4_scif_size, \
@@ -370,7 +383,9 @@
 #elif defined(CONFIG_CPU_SUBTYPE_SH7705) || \
       defined(CONFIG_CPU_SUBTYPE_SH7720) || \
       defined(CONFIG_CPU_SUBTYPE_SH7721) || \
-      defined(CONFIG_ARCH_SHMOBILE)
+      defined(CONFIG_ARCH_SH7367) || \
+      defined(CONFIG_ARCH_SH7377) || \
+      defined(CONFIG_ARCH_SH7372)
 #define SCIF_FNS(name, scif_offset, scif_size) \
   CPU_SCIF_FNS(name, scif_offset, scif_size)
 #else
@@ -406,7 +421,9 @@
 #if defined(CONFIG_CPU_SUBTYPE_SH7705) || \
     defined(CONFIG_CPU_SUBTYPE_SH7720) || \
     defined(CONFIG_CPU_SUBTYPE_SH7721) || \
-    defined(CONFIG_ARCH_SHMOBILE)
+    defined(CONFIG_ARCH_SH7367) || \
+    defined(CONFIG_ARCH_SH7377) || \
+    defined(CONFIG_ARCH_SH7372)
 
 SCIF_FNS(SCSMR,  0x00, 16)
 SCIF_FNS(SCBRR,  0x04,  8)
@@ -589,7 +606,9 @@ static inline int sci_rxd_in(struct uart_port *port)
 #elif defined(CONFIG_CPU_SUBTYPE_SH7705) || \
       defined(CONFIG_CPU_SUBTYPE_SH7720) || \
       defined(CONFIG_CPU_SUBTYPE_SH7721) || \
-      defined(CONFIG_ARCH_SHMOBILE)
+      defined(CONFIG_ARCH_SH7367) || \
+      defined(CONFIG_ARCH_SH7377) || \
+      defined(CONFIG_ARCH_SH7372)
 #define SCBRR_VALUE(bps, clk) (((clk*2)+16*bps)/(32*bps)-1)
 #elif defined(CONFIG_CPU_SUBTYPE_SH7723) ||\
       defined(CONFIG_CPU_SUBTYPE_SH7724)
diff --git a/drivers/sh/intc.c b/drivers/sh/intc.c
index a700dfec8dc3..f43850527645 100644
--- a/drivers/sh/intc.c
+++ b/drivers/sh/intc.c
@@ -2,7 +2,7 @@
  * Shared interrupt handling code for IPR and INTC2 types of IRQs.
  *
  * Copyright (C) 2007, 2008 Magnus Damm
- * Copyright (C) 2009 Paul Mundt
+ * Copyright (C) 2009, 2010 Paul Mundt
  *
  * Based on intc2.c and ipr.c
  *
@@ -26,6 +26,7 @@
 #include <linux/list.h>
 #include <linux/topology.h>
 #include <linux/bitmap.h>
+#include <linux/cpumask.h>
 
 #define _INTC_MK(fn, mode, addr_e, addr_d, width, shift) \
 	((shift) | ((width) << 5) | ((fn) << 9) | ((mode) << 13) | \
@@ -242,6 +243,10 @@ static inline void _intc_enable(unsigned int irq, unsigned long handle)
 	unsigned int cpu;
 
 	for (cpu = 0; cpu < SMP_NR(d, _INTC_ADDR_E(handle)); cpu++) {
+#ifdef CONFIG_SMP
+		if (!cpumask_test_cpu(cpu, irq_to_desc(irq)->affinity))
+			continue;
+#endif
 		addr = INTC_REG(d, _INTC_ADDR_E(handle), cpu);
 		intc_enable_fns[_INTC_MODE(handle)](addr, handle, intc_reg_fns\
 						    [_INTC_FN(handle)], irq);
@@ -261,6 +266,10 @@ static void intc_disable(unsigned int irq)
 	unsigned int cpu;
 
 	for (cpu = 0; cpu < SMP_NR(d, _INTC_ADDR_D(handle)); cpu++) {
+#ifdef CONFIG_SMP
+		if (!cpumask_test_cpu(cpu, irq_to_desc(irq)->affinity))
+			continue;
+#endif
 		addr = INTC_REG(d, _INTC_ADDR_D(handle), cpu);
 		intc_disable_fns[_INTC_MODE(handle)](addr, handle,intc_reg_fns\
 						     [_INTC_FN(handle)], irq);
@@ -309,6 +318,23 @@ static int intc_set_wake(unsigned int irq, unsigned int on)
 	return 0; /* allow wakeup, but setup hardware in intc_suspend() */
 }
 
+#ifdef CONFIG_SMP
+/*
+ * This is held with the irq desc lock held, so we don't require any
+ * additional locking here at the intc desc level. The affinity mask is
+ * later tested in the enable/disable paths.
+ */
+static int intc_set_affinity(unsigned int irq, const struct cpumask *cpumask)
+{
+	if (!cpumask_intersects(cpumask, cpu_online_mask))
+		return -1;
+
+	cpumask_copy(irq_to_desc(irq)->affinity, cpumask);
+
+	return 0;
+}
+#endif
+
 static void intc_mask_ack(unsigned int irq)
 {
 	struct intc_desc_int *d = get_intc_desc(irq);
@@ -916,6 +942,9 @@ int __init register_intc_controller(struct intc_desc *desc)
 	d->chip.shutdown = intc_disable;
 	d->chip.set_type = intc_set_sense;
 	d->chip.set_wake = intc_set_wake;
+#ifdef CONFIG_SMP
+	d->chip.set_affinity = intc_set_affinity;
+#endif
 
 	if (hw->ack_regs) {
 		for (i = 0; i < hw->nr_ack_regs; i++)
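The new set_affinity hook only records a validated mask; the real work happens in the enable/disable loops, which now skip CPUs outside it. A compact sketch of that division of labour, using plain bit masks in place of struct cpumask (all names here are illustrative):

	/* masks are plain words here; cpumask_* are the kernel equivalents */
	static unsigned long irq_affinity = ~0UL;
	static const unsigned long online_mask = 0x3;	/* CPUs 0 and 1 online */

	static int set_affinity(unsigned long requested)
	{
		if (!(requested & online_mask))
			return -1;	/* reject masks with no online CPU */
		irq_affinity = requested;
		return 0;
	}

	static void enable_per_cpu(unsigned int ncpus, void (*enable_on)(unsigned int))
	{
		for (unsigned int cpu = 0; cpu < ncpus; cpu++) {
			if (!(irq_affinity & (1UL << cpu)))
				continue;	/* the cpumask_test_cpu() skip above */
			enable_on(cpu);
		}
	}
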
diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c
index 975d556b4787..be6331e2c276 100644
--- a/drivers/usb/class/cdc-acm.c
+++ b/drivers/usb/class/cdc-acm.c
@@ -1441,7 +1441,7 @@ static int acm_resume(struct usb_interface *intf)
 		wb = acm->delayed_wb;
 		acm->delayed_wb = NULL;
 		spin_unlock_irq(&acm->write_lock);
-		acm_start_wb(acm, acm->delayed_wb);
+		acm_start_wb(acm, wb);
 	} else {
 		spin_unlock_irq(&acm->write_lock);
 	}
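The one-word fix above is the tail of a save-then-clear handoff: the pending write is detached from the shared field under the lock, and only the saved local may be used afterwards. The shape of the pattern, reduced to a sketch with illustrative types:

	struct wb { int len; };
	struct acm_state { struct wb *delayed_wb; };

	static void start_wb(struct wb *w) { (void)w; }

	static void resume(struct acm_state *acm)
	{
		struct wb *wb = acm->delayed_wb;	/* save while still locked */

		acm->delayed_wb = NULL;
		/* lock dropped here */
		if (wb)
			start_wb(wb);	/* re-reading acm->delayed_wb here was the bug */
	}
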
diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c
index 18aafcb08fc8..189141ca4e05 100644
--- a/drivers/usb/class/cdc-wdm.c
+++ b/drivers/usb/class/cdc-wdm.c
@@ -52,7 +52,8 @@ MODULE_DEVICE_TABLE (usb, wdm_ids);
52#define WDM_READ 4 52#define WDM_READ 4
53#define WDM_INT_STALL 5 53#define WDM_INT_STALL 5
54#define WDM_POLL_RUNNING 6 54#define WDM_POLL_RUNNING 6
55 55#define WDM_RESPONDING 7
56#define WDM_SUSPENDING 8
56 57
57#define WDM_MAX 16 58#define WDM_MAX 16
58 59
@@ -87,9 +88,7 @@ struct wdm_device {
87 int count; 88 int count;
88 dma_addr_t shandle; 89 dma_addr_t shandle;
89 dma_addr_t ihandle; 90 dma_addr_t ihandle;
90 struct mutex wlock; 91 struct mutex lock;
91 struct mutex rlock;
92 struct mutex plock;
93 wait_queue_head_t wait; 92 wait_queue_head_t wait;
94 struct work_struct rxwork; 93 struct work_struct rxwork;
95 int werr; 94 int werr;
@@ -117,21 +116,22 @@ static void wdm_in_callback(struct urb *urb)
117 int status = urb->status; 116 int status = urb->status;
118 117
119 spin_lock(&desc->iuspin); 118 spin_lock(&desc->iuspin);
119 clear_bit(WDM_RESPONDING, &desc->flags);
120 120
121 if (status) { 121 if (status) {
122 switch (status) { 122 switch (status) {
123 case -ENOENT: 123 case -ENOENT:
124 dev_dbg(&desc->intf->dev, 124 dev_dbg(&desc->intf->dev,
125 "nonzero urb status received: -ENOENT"); 125 "nonzero urb status received: -ENOENT");
126 break; 126 goto skip_error;
127 case -ECONNRESET: 127 case -ECONNRESET:
128 dev_dbg(&desc->intf->dev, 128 dev_dbg(&desc->intf->dev,
129 "nonzero urb status received: -ECONNRESET"); 129 "nonzero urb status received: -ECONNRESET");
130 break; 130 goto skip_error;
131 case -ESHUTDOWN: 131 case -ESHUTDOWN:
132 dev_dbg(&desc->intf->dev, 132 dev_dbg(&desc->intf->dev,
133 "nonzero urb status received: -ESHUTDOWN"); 133 "nonzero urb status received: -ESHUTDOWN");
134 break; 134 goto skip_error;
135 case -EPIPE: 135 case -EPIPE:
136 dev_err(&desc->intf->dev, 136 dev_err(&desc->intf->dev,
137 "nonzero urb status received: -EPIPE\n"); 137 "nonzero urb status received: -EPIPE\n");
@@ -147,6 +147,7 @@ static void wdm_in_callback(struct urb *urb)
147 desc->reslength = urb->actual_length; 147 desc->reslength = urb->actual_length;
148 memmove(desc->ubuf + desc->length, desc->inbuf, desc->reslength); 148 memmove(desc->ubuf + desc->length, desc->inbuf, desc->reslength);
149 desc->length += desc->reslength; 149 desc->length += desc->reslength;
150skip_error:
150 wake_up(&desc->wait); 151 wake_up(&desc->wait);
151 152
152 set_bit(WDM_READ, &desc->flags); 153 set_bit(WDM_READ, &desc->flags);
@@ -229,13 +230,16 @@ static void wdm_int_callback(struct urb *urb)
229 desc->response->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; 230 desc->response->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
230 spin_lock(&desc->iuspin); 231 spin_lock(&desc->iuspin);
231 clear_bit(WDM_READ, &desc->flags); 232 clear_bit(WDM_READ, &desc->flags);
232 if (!test_bit(WDM_DISCONNECTING, &desc->flags)) { 233 set_bit(WDM_RESPONDING, &desc->flags);
234 if (!test_bit(WDM_DISCONNECTING, &desc->flags)
235 && !test_bit(WDM_SUSPENDING, &desc->flags)) {
233 rv = usb_submit_urb(desc->response, GFP_ATOMIC); 236 rv = usb_submit_urb(desc->response, GFP_ATOMIC);
234 dev_dbg(&desc->intf->dev, "%s: usb_submit_urb %d", 237 dev_dbg(&desc->intf->dev, "%s: usb_submit_urb %d",
235 __func__, rv); 238 __func__, rv);
236 } 239 }
237 spin_unlock(&desc->iuspin); 240 spin_unlock(&desc->iuspin);
238 if (rv < 0) { 241 if (rv < 0) {
242 clear_bit(WDM_RESPONDING, &desc->flags);
239 if (rv == -EPERM) 243 if (rv == -EPERM)
240 return; 244 return;
241 if (rv == -ENOMEM) { 245 if (rv == -ENOMEM) {
@@ -305,14 +309,38 @@ static ssize_t wdm_write
305 if (we < 0) 309 if (we < 0)
306 return -EIO; 310 return -EIO;
307 311
308 r = mutex_lock_interruptible(&desc->wlock); /* concurrent writes */ 312 desc->outbuf = buf = kmalloc(count, GFP_KERNEL);
313 if (!buf) {
314 rv = -ENOMEM;
315 goto outnl;
316 }
317
318 r = copy_from_user(buf, buffer, count);
319 if (r > 0) {
320 kfree(buf);
321 rv = -EFAULT;
322 goto outnl;
323 }
324
325 /* concurrent writes and disconnect */
326 r = mutex_lock_interruptible(&desc->lock);
309 rv = -ERESTARTSYS; 327 rv = -ERESTARTSYS;
310 if (r) 328 if (r) {
329 kfree(buf);
311 goto outnl; 330 goto outnl;
331 }
332
333 if (test_bit(WDM_DISCONNECTING, &desc->flags)) {
334 kfree(buf);
335 rv = -ENODEV;
336 goto outnp;
337 }
312 338
313 r = usb_autopm_get_interface(desc->intf); 339 r = usb_autopm_get_interface(desc->intf);
314 if (r < 0) 340 if (r < 0) {
341 kfree(buf);
315 goto outnp; 342 goto outnp;
343 }
316 344
317 if (!file->f_flags && O_NONBLOCK) 345 if (!file->f_flags && O_NONBLOCK)
318 r = wait_event_interruptible(desc->wait, !test_bit(WDM_IN_USE, 346 r = wait_event_interruptible(desc->wait, !test_bit(WDM_IN_USE,
@@ -320,24 +348,8 @@ static ssize_t wdm_write
320 else 348 else
321 if (test_bit(WDM_IN_USE, &desc->flags)) 349 if (test_bit(WDM_IN_USE, &desc->flags))
322 r = -EAGAIN; 350 r = -EAGAIN;
323 if (r < 0) 351 if (r < 0) {
324 goto out;
325
326 if (test_bit(WDM_DISCONNECTING, &desc->flags)) {
327 rv = -ENODEV;
328 goto out;
329 }
330
331 desc->outbuf = buf = kmalloc(count, GFP_KERNEL);
332 if (!buf) {
333 rv = -ENOMEM;
334 goto out;
335 }
336
337 r = copy_from_user(buf, buffer, count);
338 if (r > 0) {
339 kfree(buf); 352 kfree(buf);
340 rv = -EFAULT;
341 goto out; 353 goto out;
342 } 354 }
343 355
@@ -374,7 +386,7 @@ static ssize_t wdm_write
374out: 386out:
375 usb_autopm_put_interface(desc->intf); 387 usb_autopm_put_interface(desc->intf);
376outnp: 388outnp:
377 mutex_unlock(&desc->wlock); 389 mutex_unlock(&desc->lock);
378outnl: 390outnl:
379 return rv < 0 ? rv : count; 391 return rv < 0 ? rv : count;
380} 392}
@@ -387,7 +399,7 @@ static ssize_t wdm_read
387 struct wdm_device *desc = file->private_data; 399 struct wdm_device *desc = file->private_data;
388 400
389 401
390 rv = mutex_lock_interruptible(&desc->rlock); /*concurrent reads */ 402 rv = mutex_lock_interruptible(&desc->lock); /*concurrent reads */
391 if (rv < 0) 403 if (rv < 0)
392 return -ERESTARTSYS; 404 return -ERESTARTSYS;
393 405
@@ -424,11 +436,8 @@ retry:
424 spin_lock_irq(&desc->iuspin); 436 spin_lock_irq(&desc->iuspin);
425 437
426 if (desc->rerr) { /* read completed, error happened */ 438 if (desc->rerr) { /* read completed, error happened */
427 int t = desc->rerr;
428 desc->rerr = 0; 439 desc->rerr = 0;
429 spin_unlock_irq(&desc->iuspin); 440 spin_unlock_irq(&desc->iuspin);
430 dev_err(&desc->intf->dev,
431 "reading had resulted in %d\n", t);
432 rv = -EIO; 441 rv = -EIO;
433 goto err; 442 goto err;
434 } 443 }
@@ -465,9 +474,7 @@ retry:
465 rv = cntr; 474 rv = cntr;
466 475
467err: 476err:
468 mutex_unlock(&desc->rlock); 477 mutex_unlock(&desc->lock);
469 if (rv < 0 && rv != -EAGAIN)
470 dev_err(&desc->intf->dev, "wdm_read: exit error\n");
471 return rv; 478 return rv;
472} 479}
473 480
@@ -533,7 +540,7 @@ static int wdm_open(struct inode *inode, struct file *file)
533 } 540 }
534 intf->needs_remote_wakeup = 1; 541 intf->needs_remote_wakeup = 1;
535 542
536 mutex_lock(&desc->plock); 543 mutex_lock(&desc->lock);
537 if (!desc->count++) { 544 if (!desc->count++) {
538 rv = usb_submit_urb(desc->validity, GFP_KERNEL); 545 rv = usb_submit_urb(desc->validity, GFP_KERNEL);
539 if (rv < 0) { 546 if (rv < 0) {
@@ -544,7 +551,7 @@ static int wdm_open(struct inode *inode, struct file *file)
544 } else { 551 } else {
545 rv = 0; 552 rv = 0;
546 } 553 }
547 mutex_unlock(&desc->plock); 554 mutex_unlock(&desc->lock);
548 usb_autopm_put_interface(desc->intf); 555 usb_autopm_put_interface(desc->intf);
549out: 556out:
550 mutex_unlock(&wdm_mutex); 557 mutex_unlock(&wdm_mutex);
@@ -556,9 +563,9 @@ static int wdm_release(struct inode *inode, struct file *file)
556 struct wdm_device *desc = file->private_data; 563 struct wdm_device *desc = file->private_data;
557 564
558 mutex_lock(&wdm_mutex); 565 mutex_lock(&wdm_mutex);
559 mutex_lock(&desc->plock); 566 mutex_lock(&desc->lock);
560 desc->count--; 567 desc->count--;
561 mutex_unlock(&desc->plock); 568 mutex_unlock(&desc->lock);
562 569
563 if (!desc->count) { 570 if (!desc->count) {
564 dev_dbg(&desc->intf->dev, "wdm_release: cleanup"); 571 dev_dbg(&desc->intf->dev, "wdm_release: cleanup");
@@ -655,9 +662,7 @@ next_desc:
655 desc = kzalloc(sizeof(struct wdm_device), GFP_KERNEL); 662 desc = kzalloc(sizeof(struct wdm_device), GFP_KERNEL);
656 if (!desc) 663 if (!desc)
657 goto out; 664 goto out;
658 mutex_init(&desc->wlock); 665 mutex_init(&desc->lock);
659 mutex_init(&desc->rlock);
660 mutex_init(&desc->plock);
661 spin_lock_init(&desc->iuspin); 666 spin_lock_init(&desc->iuspin);
662 init_waitqueue_head(&desc->wait); 667 init_waitqueue_head(&desc->wait);
663 desc->wMaxCommand = maxcom; 668 desc->wMaxCommand = maxcom;
@@ -771,14 +776,17 @@ static void wdm_disconnect(struct usb_interface *intf)
771 /* to terminate pending flushes */ 776 /* to terminate pending flushes */
772 clear_bit(WDM_IN_USE, &desc->flags); 777 clear_bit(WDM_IN_USE, &desc->flags);
773 spin_unlock_irqrestore(&desc->iuspin, flags); 778 spin_unlock_irqrestore(&desc->iuspin, flags);
774 cancel_work_sync(&desc->rxwork); 779 mutex_lock(&desc->lock);
775 kill_urbs(desc); 780 kill_urbs(desc);
781 cancel_work_sync(&desc->rxwork);
782 mutex_unlock(&desc->lock);
776 wake_up_all(&desc->wait); 783 wake_up_all(&desc->wait);
777 if (!desc->count) 784 if (!desc->count)
778 cleanup(desc); 785 cleanup(desc);
779 mutex_unlock(&wdm_mutex); 786 mutex_unlock(&wdm_mutex);
780} 787}
781 788
789#ifdef CONFIG_PM
782static int wdm_suspend(struct usb_interface *intf, pm_message_t message) 790static int wdm_suspend(struct usb_interface *intf, pm_message_t message)
783{ 791{
784 struct wdm_device *desc = usb_get_intfdata(intf); 792 struct wdm_device *desc = usb_get_intfdata(intf);
@@ -786,22 +794,30 @@ static int wdm_suspend(struct usb_interface *intf, pm_message_t message)
786 794
787 dev_dbg(&desc->intf->dev, "wdm%d_suspend\n", intf->minor); 795 dev_dbg(&desc->intf->dev, "wdm%d_suspend\n", intf->minor);
788 796
789 mutex_lock(&desc->plock); 797 /* if this is an autosuspend the caller does the locking */
790#ifdef CONFIG_PM 798 if (!(message.event & PM_EVENT_AUTO))
799 mutex_lock(&desc->lock);
800 spin_lock_irq(&desc->iuspin);
801
791 if ((message.event & PM_EVENT_AUTO) && 802 if ((message.event & PM_EVENT_AUTO) &&
792 test_bit(WDM_IN_USE, &desc->flags)) { 803 (test_bit(WDM_IN_USE, &desc->flags)
804 || test_bit(WDM_RESPONDING, &desc->flags))) {
805 spin_unlock_irq(&desc->iuspin);
793 rv = -EBUSY; 806 rv = -EBUSY;
794 } else { 807 } else {
795#endif 808
796 cancel_work_sync(&desc->rxwork); 809 set_bit(WDM_SUSPENDING, &desc->flags);
810 spin_unlock_irq(&desc->iuspin);
811 /* callback submits work - order is essential */
797 kill_urbs(desc); 812 kill_urbs(desc);
798#ifdef CONFIG_PM 813 cancel_work_sync(&desc->rxwork);
799 } 814 }
800#endif 815 if (!(message.event & PM_EVENT_AUTO))
801 mutex_unlock(&desc->plock); 816 mutex_unlock(&desc->lock);
802 817
803 return rv; 818 return rv;
804} 819}
820#endif
805 821
806static int recover_from_urb_loss(struct wdm_device *desc) 822static int recover_from_urb_loss(struct wdm_device *desc)
807{ 823{
@@ -815,23 +831,27 @@ static int recover_from_urb_loss(struct wdm_device *desc)
815 } 831 }
816 return rv; 832 return rv;
817} 833}
834
835#ifdef CONFIG_PM
818static int wdm_resume(struct usb_interface *intf) 836static int wdm_resume(struct usb_interface *intf)
819{ 837{
820 struct wdm_device *desc = usb_get_intfdata(intf); 838 struct wdm_device *desc = usb_get_intfdata(intf);
821 int rv; 839 int rv;
822 840
823 dev_dbg(&desc->intf->dev, "wdm%d_resume\n", intf->minor); 841 dev_dbg(&desc->intf->dev, "wdm%d_resume\n", intf->minor);
824 mutex_lock(&desc->plock); 842
843 clear_bit(WDM_SUSPENDING, &desc->flags);
825 rv = recover_from_urb_loss(desc); 844 rv = recover_from_urb_loss(desc);
826 mutex_unlock(&desc->plock); 845
827 return rv; 846 return rv;
828} 847}
848#endif
829 849
830static int wdm_pre_reset(struct usb_interface *intf) 850static int wdm_pre_reset(struct usb_interface *intf)
831{ 851{
832 struct wdm_device *desc = usb_get_intfdata(intf); 852 struct wdm_device *desc = usb_get_intfdata(intf);
833 853
834 mutex_lock(&desc->plock); 854 mutex_lock(&desc->lock);
835 return 0; 855 return 0;
836} 856}
837 857
@@ -841,7 +861,7 @@ static int wdm_post_reset(struct usb_interface *intf)
841 int rv; 861 int rv;
842 862
843 rv = recover_from_urb_loss(desc); 863 rv = recover_from_urb_loss(desc);
844 mutex_unlock(&desc->plock); 864 mutex_unlock(&desc->lock);
845 return 0; 865 return 0;
846} 866}
847 867
@@ -849,9 +869,11 @@ static struct usb_driver wdm_driver = {
849 .name = "cdc_wdm", 869 .name = "cdc_wdm",
850 .probe = wdm_probe, 870 .probe = wdm_probe,
851 .disconnect = wdm_disconnect, 871 .disconnect = wdm_disconnect,
872#ifdef CONFIG_PM
852 .suspend = wdm_suspend, 873 .suspend = wdm_suspend,
853 .resume = wdm_resume, 874 .resume = wdm_resume,
854 .reset_resume = wdm_resume, 875 .reset_resume = wdm_resume,
876#endif
855 .pre_reset = wdm_pre_reset, 877 .pre_reset = wdm_pre_reset,
856 .post_reset = wdm_post_reset, 878 .post_reset = wdm_post_reset,
857 .id_table = wdm_ids, 879 .id_table = wdm_ids,
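Note on the cdc-wdm hunks above: the change is mostly about ordering. Both wdm_disconnect() and wdm_suspend() now kill the URBs before cancelling rxwork, under the renamed desc->lock, because the read-URB completion handler schedules that work ("callback submits work - order is essential"). Cancelling first leaves a window in which a late completion requeues work after the cancel. A minimal sequential sketch of why the order matters — plain userspace C with illustrative names, modelling (not reproducing) the concurrent race:

#include <stdio.h>
#include <stdbool.h>

static bool urbs_alive = true;   /* completion handlers may still run */
static bool work_pending;        /* a queued rxwork item */

static void urb_complete(void)   /* models the read-URB callback */
{
    if (urbs_alive)
        work_pending = true;     /* "callback submits work" */
}

static void kill_urbs(void)        { urbs_alive = false; }
static void cancel_work_sync(void) { work_pending = false; }

int main(void)
{
    /* Old order: cancel first; a late completion requeues the work. */
    cancel_work_sync();
    urb_complete();
    printf("old order, work left pending: %d\n", work_pending);   /* 1 */

    /* Patched order: nothing can submit work after kill_urbs(), so
     * cancel_work_sync() really leaves nothing behind. */
    urbs_alive = true;
    work_pending = false;
    kill_urbs();
    urb_complete();              /* now a no-op */
    cancel_work_sync();
    printf("new order, work left pending: %d\n", work_pending);   /* 0 */
    return 0;
}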
diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c
index e909ff7b9094..3466fdc5bb11 100644
--- a/drivers/usb/core/devio.c
+++ b/drivers/usb/core/devio.c
@@ -1207,6 +1207,13 @@ static int proc_do_submiturb(struct dev_state *ps, struct usbdevfs_urb *uurb,
1207 free_async(as); 1207 free_async(as);
1208 return -ENOMEM; 1208 return -ENOMEM;
1209 } 1209 }
1210 /* Isochronous input data may end up being discontiguous
1211 * if some of the packets are short. Clear the buffer so
1212 * that the gaps don't leak kernel data to userspace.
1213 */
1214 if (is_in && uurb->type == USBDEVFS_URB_TYPE_ISO)
1215 memset(as->urb->transfer_buffer, 0,
1216 uurb->buffer_length);
1210 } 1217 }
1211 as->urb->dev = ps->dev; 1218 as->urb->dev = ps->dev;
1212 as->urb->pipe = (uurb->type << 30) | 1219 as->urb->pipe = (uurb->type << 30) |
@@ -1345,10 +1352,14 @@ static int processcompl(struct async *as, void __user * __user *arg)
1345 void __user *addr = as->userurb; 1352 void __user *addr = as->userurb;
1346 unsigned int i; 1353 unsigned int i;
1347 1354
1348 if (as->userbuffer && urb->actual_length) 1355 if (as->userbuffer && urb->actual_length) {
1349 if (copy_to_user(as->userbuffer, urb->transfer_buffer, 1356 if (urb->number_of_packets > 0) /* Isochronous */
1350 urb->actual_length)) 1357 i = urb->transfer_buffer_length;
1358 else /* Non-Isoc */
1359 i = urb->actual_length;
1360 if (copy_to_user(as->userbuffer, urb->transfer_buffer, i))
1351 goto err_out; 1361 goto err_out;
1362 }
1352 if (put_user(as->status, &userurb->status)) 1363 if (put_user(as->status, &userurb->status))
1353 goto err_out; 1364 goto err_out;
1354 if (put_user(urb->actual_length, &userurb->actual_length)) 1365 if (put_user(urb->actual_length, &userurb->actual_length))
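Two related fixes in the usbfs paths above: isochronous input buffers are zeroed at submit time because short packets leave gaps at fixed per-packet offsets, and processcompl() copies the full transfer_buffer_length (not actual_length) for iso URBs, since the received data need not be contiguous. A small self-contained illustration of the gap problem (illustrative sizes, not the kernel API):

#include <stdio.h>
#include <string.h>

#define PKT_LEN 8   /* fixed per-packet slot size in the transfer buffer */
#define NPKTS   3

int main(void)
{
    unsigned char buf[PKT_LEN * NPKTS];
    unsigned actual_length[NPKTS] = { 8, 3, 5 };

    /* The patch's memset: without it, the gaps between short packets
     * would hold stale allocator contents - in the kernel, a leak of
     * kernel memory to userspace. */
    memset(buf, 0, sizeof(buf));

    /* Simulate the controller: packet i lands at offset i*PKT_LEN, but
     * only actual_length[i] bytes arrive. */
    for (int i = 0; i < NPKTS; i++)
        memset(buf + i * PKT_LEN, 'A' + i, actual_length[i]);

    /* processcompl() copies the whole transfer_buffer_length for iso,
     * gaps included, because the per-packet offsets are fixed. */
    for (size_t i = 0; i < sizeof(buf); i++)
        putchar(buf[i] ? buf[i] : '.');
    putchar('\n');               /* AAAAAAAABBB.....CCCCC... */
    return 0;
}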
diff --git a/drivers/usb/core/urb.c b/drivers/usb/core/urb.c
index 27080561a1c2..45a32dadb406 100644
--- a/drivers/usb/core/urb.c
+++ b/drivers/usb/core/urb.c
@@ -453,6 +453,7 @@ int usb_submit_urb(struct urb *urb, gfp_t mem_flags)
453 if (urb->interval > (1 << 15)) 453 if (urb->interval > (1 << 15))
454 return -EINVAL; 454 return -EINVAL;
455 max = 1 << 15; 455 max = 1 << 15;
456 break;
456 case USB_SPEED_WIRELESS: 457 case USB_SPEED_WIRELESS:
457 if (urb->interval > 16) 458 if (urb->interval > 16)
458 return -EINVAL; 459 return -EINVAL;
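The one-line urb.c fix adds a missing break, so the preceding speed's interval check no longer falls through into the USB_SPEED_WIRELESS branch and rejects legal intervals. The bug class in miniature (the speed values and limits below are placeholders mirroring the shown switch):

#include <stdio.h>

/* Illustrative only: per-"speed" interval limits mirroring the switch
 * in usb_submit_urb(). */
static int check_interval(int speed, int interval)
{
    int max = 0;

    switch (speed) {
    case 0:                     /* the case above the added break */
        if (interval > (1 << 15))
            return -1;
        max = 1 << 15;
        break;                  /* the fix: without it, control falls... */
    case 1:                     /* ...through into this stricter check */
        if (interval > 16)
            return -1;
        max = 1 << 4;
        break;
    }
    return max;
}

int main(void)
{
    /* With the break, interval 100 is legal for speed 0; with the old
     * fall-through it was wrongly rejected by the "> 16" test. */
    printf("%d\n", check_interval(0, 100));   /* prints 32768 */
    return 0;
}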
diff --git a/drivers/usb/gadget/Kconfig b/drivers/usb/gadget/Kconfig
index 7460cd797f45..11a3e0fa4331 100644
--- a/drivers/usb/gadget/Kconfig
+++ b/drivers/usb/gadget/Kconfig
@@ -747,7 +747,7 @@ config USB_MASS_STORAGE
747 which may be used with composite framework. 747 which may be used with composite framework.
748 748
749 Say "y" to link the driver statically, or "m" to build 749 Say "y" to link the driver statically, or "m" to build
750 a dynamically linked module called "g_file_storage". If unsure, 750 a dynamically linked module called "g_mass_storage". If unsure,
751 consider File-backed Storage Gadget. 751 consider File-backed Storage Gadget.
752 752
753config USB_G_SERIAL 753config USB_G_SERIAL
diff --git a/drivers/usb/gadget/epautoconf.c b/drivers/usb/gadget/epautoconf.c
index 65a5f94cbc04..3568de210f79 100644
--- a/drivers/usb/gadget/epautoconf.c
+++ b/drivers/usb/gadget/epautoconf.c
@@ -266,7 +266,7 @@ struct usb_ep * __init usb_ep_autoconfig (
266 } 266 }
267 267
268#ifdef CONFIG_BLACKFIN 268#ifdef CONFIG_BLACKFIN
269 } else if (gadget_is_musbhsfc(gadget) || gadget_is_musbhdrc(gadget)) { 269 } else if (gadget_is_musbhdrc(gadget)) {
270 if ((USB_ENDPOINT_XFER_BULK == type) || 270 if ((USB_ENDPOINT_XFER_BULK == type) ||
271 (USB_ENDPOINT_XFER_ISOC == type)) { 271 (USB_ENDPOINT_XFER_ISOC == type)) {
272 if (USB_DIR_IN & desc->bEndpointAddress) 272 if (USB_DIR_IN & desc->bEndpointAddress)
diff --git a/drivers/usb/gadget/f_mass_storage.c b/drivers/usb/gadget/f_mass_storage.c
index 5a3cdd08f1d0..f4911c09022e 100644
--- a/drivers/usb/gadget/f_mass_storage.c
+++ b/drivers/usb/gadget/f_mass_storage.c
@@ -2910,7 +2910,7 @@ static void fsg_unbind(struct usb_configuration *c, struct usb_function *f)
2910} 2910}
2911 2911
2912 2912
2913static int fsg_bind(struct usb_configuration *c, struct usb_function *f) 2913static int __init fsg_bind(struct usb_configuration *c, struct usb_function *f)
2914{ 2914{
2915 struct fsg_dev *fsg = fsg_from_func(f); 2915 struct fsg_dev *fsg = fsg_from_func(f);
2916 struct usb_gadget *gadget = c->cdev->gadget; 2916 struct usb_gadget *gadget = c->cdev->gadget;
@@ -2954,7 +2954,6 @@ static int fsg_bind(struct usb_configuration *c, struct usb_function *f)
2954autoconf_fail: 2954autoconf_fail:
2955 ERROR(fsg, "unable to autoconfigure all endpoints\n"); 2955 ERROR(fsg, "unable to autoconfigure all endpoints\n");
2956 rc = -ENOTSUPP; 2956 rc = -ENOTSUPP;
2957 fsg_unbind(c, f);
2958 return rc; 2957 return rc;
2959} 2958}
2960 2959
diff --git a/drivers/usb/gadget/gadget_chips.h b/drivers/usb/gadget/gadget_chips.h
index 1edbc12fff18..e511fec9f26d 100644
--- a/drivers/usb/gadget/gadget_chips.h
+++ b/drivers/usb/gadget/gadget_chips.h
@@ -136,6 +136,12 @@
136#define gadget_is_r8a66597(g) 0 136#define gadget_is_r8a66597(g) 0
137#endif 137#endif
138 138
139#ifdef CONFIG_USB_S3C_HSOTG
140#define gadget_is_s3c_hsotg(g) (!strcmp("s3c-hsotg", (g)->name))
141#else
142#define gadget_is_s3c_hsotg(g) 0
143#endif
144
139 145
140/** 146/**
141 * usb_gadget_controller_number - support bcdDevice id convention 147 * usb_gadget_controller_number - support bcdDevice id convention
@@ -192,6 +198,8 @@ static inline int usb_gadget_controller_number(struct usb_gadget *gadget)
192 return 0x24; 198 return 0x24;
193 else if (gadget_is_r8a66597(gadget)) 199 else if (gadget_is_r8a66597(gadget))
194 return 0x25; 200 return 0x25;
201 else if (gadget_is_s3c_hsotg(gadget))
202 return 0x26;
195 return -ENOENT; 203 return -ENOENT;
196} 204}
197 205
diff --git a/drivers/usb/gadget/goku_udc.c b/drivers/usb/gadget/goku_udc.c
index e8edc640381e..1088d08c7ed8 100644
--- a/drivers/usb/gadget/goku_udc.c
+++ b/drivers/usb/gadget/goku_udc.c
@@ -1768,7 +1768,7 @@ static int goku_probe(struct pci_dev *pdev, const struct pci_device_id *id)
1768 * usb_gadget_driver_{register,unregister}() must change. 1768 * usb_gadget_driver_{register,unregister}() must change.
1769 */ 1769 */
1770 if (the_controller) { 1770 if (the_controller) {
1771 WARNING(dev, "ignoring %s\n", pci_name(pdev)); 1771 pr_warning("ignoring %s\n", pci_name(pdev));
1772 return -EBUSY; 1772 return -EBUSY;
1773 } 1773 }
1774 if (!pdev->irq) { 1774 if (!pdev->irq) {
diff --git a/drivers/usb/gadget/multi.c b/drivers/usb/gadget/multi.c
index 76496f5d272c..a930d7fd7e7a 100644
--- a/drivers/usb/gadget/multi.c
+++ b/drivers/usb/gadget/multi.c
@@ -211,8 +211,6 @@ static int __init cdc_do_config(struct usb_configuration *c)
211 ret = fsg_add(c->cdev, c, fsg_common); 211 ret = fsg_add(c->cdev, c, fsg_common);
212 if (ret < 0) 212 if (ret < 0)
213 return ret; 213 return ret;
214 if (ret < 0)
215 return ret;
216 214
217 return 0; 215 return 0;
218} 216}
diff --git a/drivers/usb/host/Makefile b/drivers/usb/host/Makefile
index 4e0c67f1f51b..b6315aa47f7a 100644
--- a/drivers/usb/host/Makefile
+++ b/drivers/usb/host/Makefile
@@ -12,7 +12,7 @@ fhci-objs := fhci-hcd.o fhci-hub.o fhci-q.o fhci-mem.o \
12ifeq ($(CONFIG_FHCI_DEBUG),y) 12ifeq ($(CONFIG_FHCI_DEBUG),y)
13fhci-objs += fhci-dbg.o 13fhci-objs += fhci-dbg.o
14endif 14endif
15xhci-objs := xhci-hcd.o xhci-mem.o xhci-pci.o xhci-ring.o xhci-hub.o xhci-dbg.o 15xhci-hcd-objs := xhci.o xhci-mem.o xhci-pci.o xhci-ring.o xhci-hub.o xhci-dbg.o
16 16
17obj-$(CONFIG_USB_WHCI_HCD) += whci/ 17obj-$(CONFIG_USB_WHCI_HCD) += whci/
18 18
@@ -25,7 +25,7 @@ obj-$(CONFIG_USB_ISP1362_HCD) += isp1362-hcd.o
25obj-$(CONFIG_USB_OHCI_HCD) += ohci-hcd.o 25obj-$(CONFIG_USB_OHCI_HCD) += ohci-hcd.o
26obj-$(CONFIG_USB_UHCI_HCD) += uhci-hcd.o 26obj-$(CONFIG_USB_UHCI_HCD) += uhci-hcd.o
27obj-$(CONFIG_USB_FHCI_HCD) += fhci.o 27obj-$(CONFIG_USB_FHCI_HCD) += fhci.o
28obj-$(CONFIG_USB_XHCI_HCD) += xhci.o 28obj-$(CONFIG_USB_XHCI_HCD) += xhci-hcd.o
29obj-$(CONFIG_USB_SL811_HCD) += sl811-hcd.o 29obj-$(CONFIG_USB_SL811_HCD) += sl811-hcd.o
30obj-$(CONFIG_USB_SL811_CS) += sl811_cs.o 30obj-$(CONFIG_USB_SL811_CS) += sl811_cs.o
31obj-$(CONFIG_USB_U132_HCD) += u132-hcd.o 31obj-$(CONFIG_USB_U132_HCD) += u132-hcd.o
diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c
index d8d6d3461d32..dc55a62859c6 100644
--- a/drivers/usb/host/ehci-hcd.c
+++ b/drivers/usb/host/ehci-hcd.c
@@ -995,7 +995,7 @@ rescan:
995 /* endpoints can be iso streams. for now, we don't 995 /* endpoints can be iso streams. for now, we don't
996 * accelerate iso completions ... so spin a while. 996 * accelerate iso completions ... so spin a while.
997 */ 997 */
998 if (qh->hw->hw_info1 == 0) { 998 if (qh->hw == NULL) {
999 ehci_vdbg (ehci, "iso delay\n"); 999 ehci_vdbg (ehci, "iso delay\n");
1000 goto idle_timeout; 1000 goto idle_timeout;
1001 } 1001 }
diff --git a/drivers/usb/host/ehci-sched.c b/drivers/usb/host/ehci-sched.c
index 39340ae00ac4..a0aaaaff2560 100644
--- a/drivers/usb/host/ehci-sched.c
+++ b/drivers/usb/host/ehci-sched.c
@@ -1123,8 +1123,8 @@ iso_stream_find (struct ehci_hcd *ehci, struct urb *urb)
1123 urb->interval); 1123 urb->interval);
1124 } 1124 }
1125 1125
1126 /* if dev->ep [epnum] is a QH, info1.maxpacket is nonzero */ 1126 /* if dev->ep [epnum] is a QH, hw is set */
1127 } else if (unlikely (stream->hw_info1 != 0)) { 1127 } else if (unlikely (stream->hw != NULL)) {
1128 ehci_dbg (ehci, "dev %s ep%d%s, not iso??\n", 1128 ehci_dbg (ehci, "dev %s ep%d%s, not iso??\n",
1129 urb->dev->devpath, epnum, 1129 urb->dev->devpath, epnum,
1130 usb_pipein(urb->pipe) ? "in" : "out"); 1130 usb_pipein(urb->pipe) ? "in" : "out");
@@ -1565,13 +1565,27 @@ itd_patch(
1565static inline void 1565static inline void
1566itd_link (struct ehci_hcd *ehci, unsigned frame, struct ehci_itd *itd) 1566itd_link (struct ehci_hcd *ehci, unsigned frame, struct ehci_itd *itd)
1567{ 1567{
1568 /* always prepend ITD/SITD ... only QH tree is order-sensitive */ 1568 union ehci_shadow *prev = &ehci->pshadow[frame];
1569 itd->itd_next = ehci->pshadow [frame]; 1569 __hc32 *hw_p = &ehci->periodic[frame];
1570 itd->hw_next = ehci->periodic [frame]; 1570 union ehci_shadow here = *prev;
1571 ehci->pshadow [frame].itd = itd; 1571 __hc32 type = 0;
1572
1573 /* skip any iso nodes which might belong to previous microframes */
1574 while (here.ptr) {
1575 type = Q_NEXT_TYPE(ehci, *hw_p);
1576 if (type == cpu_to_hc32(ehci, Q_TYPE_QH))
1577 break;
1578 prev = periodic_next_shadow(ehci, prev, type);
1579 hw_p = shadow_next_periodic(ehci, &here, type);
1580 here = *prev;
1581 }
1582
1583 itd->itd_next = here;
1584 itd->hw_next = *hw_p;
1585 prev->itd = itd;
1572 itd->frame = frame; 1586 itd->frame = frame;
1573 wmb (); 1587 wmb ();
1574 ehci->periodic[frame] = cpu_to_hc32(ehci, itd->itd_dma | Q_TYPE_ITD); 1588 *hw_p = cpu_to_hc32(ehci, itd->itd_dma | Q_TYPE_ITD);
1575} 1589}
1576 1590
1577/* fit urb's itds into the selected schedule slot; activate as needed */ 1591/* fit urb's itds into the selected schedule slot; activate as needed */
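The reworked itd_link() above no longer blindly prepends: it walks the frame's shadow list past iso nodes (which may belong to earlier microframes) and links the new ITD just before the first QH, since only the QH part of the periodic tree is order-sensitive. A simplified list model of that walk — self-contained, with the typed-union shadow machinery reduced to a plain tagged node:

#include <stdio.h>

enum ntype { ITD, QH };

struct node {
    enum ntype   type;
    struct node *next;
    const char  *name;
};

/* Insert a new ITD after any existing iso nodes but before the first
 * QH, mirroring the patched itd_link() walk over pshadow[frame]. */
static void itd_link(struct node **head, struct node *itd)
{
    struct node **prev = head;

    while (*prev && (*prev)->type != QH)   /* skip earlier iso nodes */
        prev = &(*prev)->next;

    itd->next = *prev;                     /* link in front of the QHs */
    *prev = itd;
}

int main(void)
{
    struct node qh   = { QH,  NULL, "qh"   };
    struct node itd1 = { ITD, &qh,  "itd1" };
    struct node itd2 = { ITD, NULL, "itd2" };
    struct node *head = &itd1;

    itd_link(&head, &itd2);
    for (struct node *n = head; n; n = n->next)
        printf("%s ", n->name);            /* itd1 itd2 qh */
    printf("\n");
    return 0;
}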
diff --git a/drivers/usb/host/ehci.h b/drivers/usb/host/ehci.h
index 2d85e21ff282..b1dce96dd621 100644
--- a/drivers/usb/host/ehci.h
+++ b/drivers/usb/host/ehci.h
@@ -394,9 +394,8 @@ struct ehci_iso_sched {
394 * acts like a qh would, if EHCI had them for ISO. 394 * acts like a qh would, if EHCI had them for ISO.
395 */ 395 */
396struct ehci_iso_stream { 396struct ehci_iso_stream {
397 /* first two fields match QH, but info1 == 0 */ 397 /* first field matches ehci_qh, but is NULL */
398 __hc32 hw_next; 398 struct ehci_qh_hw *hw;
399 __hc32 hw_info1;
400 399
401 u32 refcount; 400 u32 refcount;
402 u8 bEndpointAddress; 401 u8 bEndpointAddress;
diff --git a/drivers/usb/host/r8a66597-hcd.c b/drivers/usb/host/r8a66597-hcd.c
index bee558aed427..f71a73a93d0c 100644
--- a/drivers/usb/host/r8a66597-hcd.c
+++ b/drivers/usb/host/r8a66597-hcd.c
@@ -418,7 +418,7 @@ static u8 alloc_usb_address(struct r8a66597 *r8a66597, struct urb *urb)
418 418
419/* this function must be called with interrupt disabled */ 419/* this function must be called with interrupt disabled */
420static void free_usb_address(struct r8a66597 *r8a66597, 420static void free_usb_address(struct r8a66597 *r8a66597,
421 struct r8a66597_device *dev) 421 struct r8a66597_device *dev, int reset)
422{ 422{
423 int port; 423 int port;
424 424
@@ -430,7 +430,13 @@ static void free_usb_address(struct r8a66597 *r8a66597,
430 dev->state = USB_STATE_DEFAULT; 430 dev->state = USB_STATE_DEFAULT;
431 r8a66597->address_map &= ~(1 << dev->address); 431 r8a66597->address_map &= ~(1 << dev->address);
432 dev->address = 0; 432 dev->address = 0;
433 dev_set_drvdata(&dev->udev->dev, NULL); 433 /*
 434 * Only when resetting USB is it necessary to erase drvdata. When
 435 * a usb device behind a usb hub is disconnected, "dev->udev" is
 436 * already freed in usb_disconnect(), so we cannot access the data.
437 */
438 if (reset)
439 dev_set_drvdata(&dev->udev->dev, NULL);
434 list_del(&dev->device_list); 440 list_del(&dev->device_list);
435 kfree(dev); 441 kfree(dev);
436 442
@@ -1069,7 +1075,7 @@ static void r8a66597_usb_disconnect(struct r8a66597 *r8a66597, int port)
1069 struct r8a66597_device *dev = r8a66597->root_hub[port].dev; 1075 struct r8a66597_device *dev = r8a66597->root_hub[port].dev;
1070 1076
1071 disable_r8a66597_pipe_all(r8a66597, dev); 1077 disable_r8a66597_pipe_all(r8a66597, dev);
1072 free_usb_address(r8a66597, dev); 1078 free_usb_address(r8a66597, dev, 0);
1073 1079
1074 start_root_hub_sampling(r8a66597, port, 0); 1080 start_root_hub_sampling(r8a66597, port, 0);
1075} 1081}
@@ -2085,7 +2091,7 @@ static void update_usb_address_map(struct r8a66597 *r8a66597,
2085 spin_lock_irqsave(&r8a66597->lock, flags); 2091 spin_lock_irqsave(&r8a66597->lock, flags);
2086 dev = get_r8a66597_device(r8a66597, addr); 2092 dev = get_r8a66597_device(r8a66597, addr);
2087 disable_r8a66597_pipe_all(r8a66597, dev); 2093 disable_r8a66597_pipe_all(r8a66597, dev);
2088 free_usb_address(r8a66597, dev); 2094 free_usb_address(r8a66597, dev, 0);
2089 put_child_connect_map(r8a66597, addr); 2095 put_child_connect_map(r8a66597, addr);
2090 spin_unlock_irqrestore(&r8a66597->lock, flags); 2096 spin_unlock_irqrestore(&r8a66597->lock, flags);
2091 } 2097 }
@@ -2228,7 +2234,7 @@ static int r8a66597_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue,
2228 rh->port |= (1 << USB_PORT_FEAT_RESET); 2234 rh->port |= (1 << USB_PORT_FEAT_RESET);
2229 2235
2230 disable_r8a66597_pipe_all(r8a66597, dev); 2236 disable_r8a66597_pipe_all(r8a66597, dev);
2231 free_usb_address(r8a66597, dev); 2237 free_usb_address(r8a66597, dev, 1);
2232 2238
2233 r8a66597_mdfy(r8a66597, USBRST, USBRST | UACT, 2239 r8a66597_mdfy(r8a66597, USBRST, USBRST | UACT,
2234 get_dvstctr_reg(port)); 2240 get_dvstctr_reg(port));
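The reset flag threaded through free_usb_address() encodes a lifetime rule: only the port-reset path may dereference dev->udev to clear its drvdata; on a hub disconnect that udev is already freed. A toy model of the guarded teardown (illustrative structs, not the driver's types):

#include <stdio.h>
#include <stdlib.h>

struct udev  { void *drvdata; };
struct r8dev { struct udev *udev; };

/* Models the patched free_usb_address(): only the port-reset path may
 * dereference dev->udev; on hub disconnect it is already freed. */
static void free_usb_address(struct r8dev *dev, int reset)
{
    if (reset)
        dev->udev->drvdata = NULL;   /* safe: udev still alive */
    free(dev);                       /* the driver-private struct */
}

int main(void)
{
    struct udev u = { &u };          /* placeholder drvdata */
    struct r8dev *d = malloc(sizeof(*d));

    if (!d)
        return 1;
    d->udev = &u;
    free_usb_address(d, 1);          /* reset: clear drvdata */
    printf("drvdata cleared: %d\n", u.drvdata == NULL);   /* 1 */
    return 0;
}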
diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c
index 49f7d72f8b1b..bba9b19ed1b9 100644
--- a/drivers/usb/host/xhci-mem.c
+++ b/drivers/usb/host/xhci-mem.c
@@ -566,8 +566,13 @@ static inline unsigned int xhci_get_endpoint_interval(struct usb_device *udev,
566 if (interval < 3) 566 if (interval < 3)
567 interval = 3; 567 interval = 3;
568 if ((1 << interval) != 8*ep->desc.bInterval) 568 if ((1 << interval) != 8*ep->desc.bInterval)
569 dev_warn(&udev->dev, "ep %#x - rounding interval to %d microframes\n", 569 dev_warn(&udev->dev,
570 ep->desc.bEndpointAddress, 1 << interval); 570 "ep %#x - rounding interval"
571 " to %d microframes, "
572 "ep desc says %d microframes\n",
573 ep->desc.bEndpointAddress,
574 1 << interval,
575 8*ep->desc.bInterval);
571 } 576 }
572 break; 577 break;
573 default: 578 default:
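The widened warning above reports both the rounded value and what the descriptor actually asked for. For full/low-speed periodic endpoints, bInterval is in 1 ms frames while xHCI wants a power-of-two exponent in 125 us microframes, so non-power-of-two requests get rounded. A standalone sketch of that rounding — the fls()+2 derivation is an assumption consistent with the shown "(1 << interval) != 8*bInterval" check, not quoted from the driver:

#include <stdio.h>

/* 1-based position of the highest set bit (0 if v == 0); the same
 * contract as the kernel's fls(). */
static int fls_(unsigned v)
{
    int n = 0;

    while (v) {
        n++;
        v >>= 1;
    }
    return n;
}

int main(void)
{
    for (unsigned bInterval = 1; bInterval <= 10; bInterval++) {
        int exp = fls_(bInterval) + 2;   /* frames -> 2^exp microframes */

        if (exp < 3)
            exp = 3;                     /* mirror the shown clamp */
        printf("bInterval=%2u -> %3d uframes (desc asked for %u)%s\n",
               bInterval, 1 << exp, 8 * bInterval,
               (1 << exp) != 8 * bInterval ? "  <- rounded" : "");
    }
    return 0;
}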
diff --git a/drivers/usb/host/xhci-hcd.c b/drivers/usb/host/xhci.c
index 4cb69e0af834..492a61c2c79d 100644
--- a/drivers/usb/host/xhci-hcd.c
+++ b/drivers/usb/host/xhci.c
@@ -1173,6 +1173,7 @@ static int xhci_configure_endpoint(struct xhci_hcd *xhci,
1173 cmd_completion = &virt_dev->cmd_completion; 1173 cmd_completion = &virt_dev->cmd_completion;
1174 cmd_status = &virt_dev->cmd_status; 1174 cmd_status = &virt_dev->cmd_status;
1175 } 1175 }
1176 init_completion(cmd_completion);
1176 1177
1177 if (!ctx_change) 1178 if (!ctx_change)
1178 ret = xhci_queue_configure_endpoint(xhci, in_ctx->dma, 1179 ret = xhci_queue_configure_endpoint(xhci, in_ctx->dma,
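The added init_completion() matters because xhci_configure_endpoint() reuses a completion embedded in the virt device (or the xHCI struct): a leftover "done" count from an earlier command would make the next wait return before the new command has actually completed. A tiny counter-based model of the reuse hazard — not the kernel's implementation, just its semantics:

#include <stdio.h>

struct completion { unsigned done; };

static void init_completion(struct completion *c) { c->done = 0; }
static void complete(struct completion *c)        { c->done++; }

/* Returns 1 if the caller would actually have waited for the new
 * command, 0 if a stale count let it fall straight through. */
static int wait_for_completion(struct completion *c)
{
    if (c->done) {
        c->done--;           /* consumed a previous completion */
        return 0;
    }
    return 1;                /* would block here until complete() */
}

int main(void)
{
    struct completion cmd = { 0 };

    /* A stale count, e.g. from a command whose completion event came
     * in after its waiter had already given up. */
    complete(&cmd);

    /* Next command, without the patch: no real wait. */
    printf("waited: %d\n", wait_for_completion(&cmd));   /* 0 - the bug */

    /* With the patch: re-init just before queueing each command. */
    init_completion(&cmd);
    printf("waited: %d\n", wait_for_completion(&cmd));   /* 1 - correct */
    return 0;
}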
diff --git a/drivers/usb/musb/musb_core.c b/drivers/usb/musb/musb_core.c
index b4bbf8f2c238..0e8b8ab1d168 100644
--- a/drivers/usb/musb/musb_core.c
+++ b/drivers/usb/musb/musb_core.c
@@ -379,7 +379,6 @@ static irqreturn_t musb_stage0_irq(struct musb *musb, u8 int_usb,
379 u8 devctl, u8 power) 379 u8 devctl, u8 power)
380{ 380{
381 irqreturn_t handled = IRQ_NONE; 381 irqreturn_t handled = IRQ_NONE;
382 void __iomem *mbase = musb->mregs;
383 382
384 DBG(3, "<== Power=%02x, DevCtl=%02x, int_usb=0x%x\n", power, devctl, 383 DBG(3, "<== Power=%02x, DevCtl=%02x, int_usb=0x%x\n", power, devctl,
385 int_usb); 384 int_usb);
@@ -394,6 +393,8 @@ static irqreturn_t musb_stage0_irq(struct musb *musb, u8 int_usb,
394 393
395 if (devctl & MUSB_DEVCTL_HM) { 394 if (devctl & MUSB_DEVCTL_HM) {
396#ifdef CONFIG_USB_MUSB_HDRC_HCD 395#ifdef CONFIG_USB_MUSB_HDRC_HCD
396 void __iomem *mbase = musb->mregs;
397
397 switch (musb->xceiv->state) { 398 switch (musb->xceiv->state) {
398 case OTG_STATE_A_SUSPEND: 399 case OTG_STATE_A_SUSPEND:
399 /* remote wakeup? later, GetPortStatus 400 /* remote wakeup? later, GetPortStatus
@@ -471,6 +472,8 @@ static irqreturn_t musb_stage0_irq(struct musb *musb, u8 int_usb,
471#ifdef CONFIG_USB_MUSB_HDRC_HCD 472#ifdef CONFIG_USB_MUSB_HDRC_HCD
472 /* see manual for the order of the tests */ 473 /* see manual for the order of the tests */
473 if (int_usb & MUSB_INTR_SESSREQ) { 474 if (int_usb & MUSB_INTR_SESSREQ) {
475 void __iomem *mbase = musb->mregs;
476
474 DBG(1, "SESSION_REQUEST (%s)\n", otg_state_string(musb)); 477 DBG(1, "SESSION_REQUEST (%s)\n", otg_state_string(musb));
475 478
476 /* IRQ arrives from ID pin sense or (later, if VBUS power 479 /* IRQ arrives from ID pin sense or (later, if VBUS power
@@ -519,6 +522,8 @@ static irqreturn_t musb_stage0_irq(struct musb *musb, u8 int_usb,
519 case OTG_STATE_A_WAIT_BCON: 522 case OTG_STATE_A_WAIT_BCON:
520 case OTG_STATE_A_WAIT_VRISE: 523 case OTG_STATE_A_WAIT_VRISE:
521 if (musb->vbuserr_retry) { 524 if (musb->vbuserr_retry) {
525 void __iomem *mbase = musb->mregs;
526
522 musb->vbuserr_retry--; 527 musb->vbuserr_retry--;
523 ignore = 1; 528 ignore = 1;
524 devctl |= MUSB_DEVCTL_SESSION; 529 devctl |= MUSB_DEVCTL_SESSION;
@@ -622,6 +627,7 @@ static irqreturn_t musb_stage0_irq(struct musb *musb, u8 int_usb,
622 627
623 if (int_usb & MUSB_INTR_CONNECT) { 628 if (int_usb & MUSB_INTR_CONNECT) {
624 struct usb_hcd *hcd = musb_to_hcd(musb); 629 struct usb_hcd *hcd = musb_to_hcd(musb);
630 void __iomem *mbase = musb->mregs;
625 631
626 handled = IRQ_HANDLED; 632 handled = IRQ_HANDLED;
627 musb->is_active = 1; 633 musb->is_active = 1;
@@ -2007,7 +2013,6 @@ bad_config:
2007 /* host side needs more setup */ 2013 /* host side needs more setup */
2008 if (is_host_enabled(musb)) { 2014 if (is_host_enabled(musb)) {
2009 struct usb_hcd *hcd = musb_to_hcd(musb); 2015 struct usb_hcd *hcd = musb_to_hcd(musb);
2010 u8 busctl;
2011 2016
2012 otg_set_host(musb->xceiv, &hcd->self); 2017 otg_set_host(musb->xceiv, &hcd->self);
2013 2018
@@ -2018,9 +2023,9 @@ bad_config:
2018 2023
2019 /* program PHY to use external vBus if required */ 2024 /* program PHY to use external vBus if required */
2020 if (plat->extvbus) { 2025 if (plat->extvbus) {
2021 busctl = musb_readb(musb->mregs, MUSB_ULPI_BUSCONTROL); 2026 u8 busctl = musb_read_ulpi_buscontrol(musb->mregs);
2022 busctl |= MUSB_ULPI_USE_EXTVBUS; 2027 busctl |= MUSB_ULPI_USE_EXTVBUS;
2023 musb_writeb(musb->mregs, MUSB_ULPI_BUSCONTROL, busctl); 2028 musb_write_ulpi_buscontrol(musb->mregs, busctl);
2024 } 2029 }
2025 } 2030 }
2026 2031
diff --git a/drivers/usb/musb/musb_core.h b/drivers/usb/musb/musb_core.h
index d849fb81c131..cd9f4a9a06c6 100644
--- a/drivers/usb/musb/musb_core.h
+++ b/drivers/usb/musb/musb_core.h
@@ -469,7 +469,7 @@ struct musb_csr_regs {
469 469
470struct musb_context_registers { 470struct musb_context_registers {
471 471
472#if defined(CONFIG_ARCH_OMAP34XX) || defined(CONFIG_ARCH_OMAP2430) 472#ifdef CONFIG_PM
473 u32 otg_sysconfig, otg_forcestandby; 473 u32 otg_sysconfig, otg_forcestandby;
474#endif 474#endif
475 u8 power; 475 u8 power;
@@ -483,7 +483,7 @@ struct musb_context_registers {
483 struct musb_csr_regs index_regs[MUSB_C_NUM_EPS]; 483 struct musb_csr_regs index_regs[MUSB_C_NUM_EPS];
484}; 484};
485 485
486#if defined(CONFIG_ARCH_OMAP34XX) || defined(CONFIG_ARCH_OMAP2430) 486#ifdef CONFIG_PM
487extern void musb_platform_save_context(struct musb *musb, 487extern void musb_platform_save_context(struct musb *musb,
488 struct musb_context_registers *musb_context); 488 struct musb_context_registers *musb_context);
489extern void musb_platform_restore_context(struct musb *musb, 489extern void musb_platform_restore_context(struct musb *musb,
diff --git a/drivers/usb/musb/musb_host.c b/drivers/usb/musb/musb_host.c
index 3421cf9858b5..dec896e888db 100644
--- a/drivers/usb/musb/musb_host.c
+++ b/drivers/usb/musb/musb_host.c
@@ -1689,7 +1689,7 @@ void musb_host_rx(struct musb *musb, u8 epnum)
1689 dma->desired_mode = 1; 1689 dma->desired_mode = 1;
1690 if (rx_count < hw_ep->max_packet_sz_rx) { 1690 if (rx_count < hw_ep->max_packet_sz_rx) {
1691 length = rx_count; 1691 length = rx_count;
1692 dma->bDesiredMode = 0; 1692 dma->desired_mode = 0;
1693 } else { 1693 } else {
1694 length = urb->transfer_buffer_length; 1694 length = urb->transfer_buffer_length;
1695 } 1695 }
diff --git a/drivers/usb/musb/musb_regs.h b/drivers/usb/musb/musb_regs.h
index 8d8062b10e2f..fa55aacc385d 100644
--- a/drivers/usb/musb/musb_regs.h
+++ b/drivers/usb/musb/musb_regs.h
@@ -326,6 +326,11 @@ static inline void musb_write_rxfifoadd(void __iomem *mbase, u16 c_off)
326 musb_writew(mbase, MUSB_RXFIFOADD, c_off); 326 musb_writew(mbase, MUSB_RXFIFOADD, c_off);
327} 327}
328 328
329static inline void musb_write_ulpi_buscontrol(void __iomem *mbase, u8 val)
330{
331 musb_writeb(mbase, MUSB_ULPI_BUSCONTROL, val);
332}
333
329static inline u8 musb_read_txfifosz(void __iomem *mbase) 334static inline u8 musb_read_txfifosz(void __iomem *mbase)
330{ 335{
331 return musb_readb(mbase, MUSB_TXFIFOSZ); 336 return musb_readb(mbase, MUSB_TXFIFOSZ);
@@ -346,6 +351,11 @@ static inline u16 musb_read_rxfifoadd(void __iomem *mbase)
346 return musb_readw(mbase, MUSB_RXFIFOADD); 351 return musb_readw(mbase, MUSB_RXFIFOADD);
347} 352}
348 353
354static inline u8 musb_read_ulpi_buscontrol(void __iomem *mbase)
355{
356 return musb_readb(mbase, MUSB_ULPI_BUSCONTROL);
357}
358
349static inline u8 musb_read_configdata(void __iomem *mbase) 359static inline u8 musb_read_configdata(void __iomem *mbase)
350{ 360{
351 musb_writeb(mbase, MUSB_INDEX, 0); 361 musb_writeb(mbase, MUSB_INDEX, 0);
@@ -510,20 +520,33 @@ static inline void musb_write_rxfifoadd(void __iomem *mbase, u16 c_off)
510{ 520{
511} 521}
512 522
523static inline void musb_write_ulpi_buscontrol(void __iomem *mbase, u8 val)
524{
525}
526
513static inline u8 musb_read_txfifosz(void __iomem *mbase) 527static inline u8 musb_read_txfifosz(void __iomem *mbase)
514{ 528{
529 return 0;
515} 530}
516 531
517static inline u16 musb_read_txfifoadd(void __iomem *mbase) 532static inline u16 musb_read_txfifoadd(void __iomem *mbase)
518{ 533{
534 return 0;
519} 535}
520 536
521static inline u8 musb_read_rxfifosz(void __iomem *mbase) 537static inline u8 musb_read_rxfifosz(void __iomem *mbase)
522{ 538{
539 return 0;
523} 540}
524 541
525static inline u16 musb_read_rxfifoadd(void __iomem *mbase) 542static inline u16 musb_read_rxfifoadd(void __iomem *mbase)
526{ 543{
544 return 0;
545}
546
547static inline u8 musb_read_ulpi_buscontrol(void __iomem *mbase)
548{
549 return 0;
527} 550}
528 551
529static inline u8 musb_read_configdata(void __iomem *mbase) 552static inline u8 musb_read_configdata(void __iomem *mbase)
@@ -577,22 +600,27 @@ static inline void musb_write_txhubport(void __iomem *mbase, u8 epnum,
577 600
578static inline u8 musb_read_rxfunaddr(void __iomem *mbase, u8 epnum) 601static inline u8 musb_read_rxfunaddr(void __iomem *mbase, u8 epnum)
579{ 602{
603 return 0;
580} 604}
581 605
582static inline u8 musb_read_rxhubaddr(void __iomem *mbase, u8 epnum) 606static inline u8 musb_read_rxhubaddr(void __iomem *mbase, u8 epnum)
583{ 607{
608 return 0;
584} 609}
585 610
586static inline u8 musb_read_rxhubport(void __iomem *mbase, u8 epnum) 611static inline u8 musb_read_rxhubport(void __iomem *mbase, u8 epnum)
587{ 612{
613 return 0;
588} 614}
589 615
590static inline u8 musb_read_txfunaddr(void __iomem *mbase, u8 epnum) 616static inline u8 musb_read_txfunaddr(void __iomem *mbase, u8 epnum)
591{ 617{
618 return 0;
592} 619}
593 620
594static inline u8 musb_read_txhubaddr(void __iomem *mbase, u8 epnum) 621static inline u8 musb_read_txhubaddr(void __iomem *mbase, u8 epnum)
595{ 622{
623 return 0;
596} 624}
597 625
598static inline void musb_read_txhubport(void __iomem *mbase, u8 epnum) 626static inline void musb_read_txhubport(void __iomem *mbase, u8 epnum)
diff --git a/drivers/usb/serial/Kconfig b/drivers/usb/serial/Kconfig
index c78b255e3f83..a0ecb42cb33a 100644
--- a/drivers/usb/serial/Kconfig
+++ b/drivers/usb/serial/Kconfig
@@ -474,14 +474,14 @@ config USB_SERIAL_OTI6858
474 474
475config USB_SERIAL_QCAUX 475config USB_SERIAL_QCAUX
476 tristate "USB Qualcomm Auxiliary Serial Port Driver" 476 tristate "USB Qualcomm Auxiliary Serial Port Driver"
477 ---help--- 477 help
478 Say Y here if you want to use the auxiliary serial ports provided 478 Say Y here if you want to use the auxiliary serial ports provided
479 by many modems based on Qualcomm chipsets. These ports often use 479 by many modems based on Qualcomm chipsets. These ports often use
480 a proprietary protocol called DM and cannot be used for AT- or 480 a proprietary protocol called DM and cannot be used for AT- or
481 PPP-based communication. 481 PPP-based communication.
482 482
483 To compile this driver as a module, choose M here: the 483 To compile this driver as a module, choose M here: the
484 module will be called moto_modem. If unsure, choose N. 484 module will be called qcaux. If unsure, choose N.
485 485
486config USB_SERIAL_QUALCOMM 486config USB_SERIAL_QUALCOMM
487 tristate "USB Qualcomm Serial modem" 487 tristate "USB Qualcomm Serial modem"
diff --git a/drivers/usb/serial/console.c b/drivers/usb/serial/console.c
index b22ac3258523..f347da2ef00a 100644
--- a/drivers/usb/serial/console.c
+++ b/drivers/usb/serial/console.c
@@ -181,6 +181,7 @@ static int usb_console_setup(struct console *co, char *options)
181 /* The console is special in terms of closing the device so 181 /* The console is special in terms of closing the device so
182 * indicate this port is now acting as a system console. */ 182 * indicate this port is now acting as a system console. */
183 port->console = 1; 183 port->console = 1;
184 port->port.console = 1;
184 185
185 mutex_unlock(&serial->disc_mutex); 186 mutex_unlock(&serial->disc_mutex);
186 return retval; 187 return retval;
diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c
index 507382b0a9ed..ec9b0449ccf6 100644
--- a/drivers/usb/serial/cp210x.c
+++ b/drivers/usb/serial/cp210x.c
@@ -313,11 +313,6 @@ static int cp210x_set_config(struct usb_serial_port *port, u8 request,
313 return -EPROTO; 313 return -EPROTO;
314 } 314 }
315 315
316 /* Single data value */
317 result = usb_control_msg(serial->dev,
318 usb_sndctrlpipe(serial->dev, 0),
319 request, REQTYPE_HOST_TO_DEVICE, data[0],
320 0, NULL, 0, 300);
321 return 0; 316 return 0;
322} 317}
323 318
diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c
index 6af0dfa5f5ac..1d7c4fac02e8 100644
--- a/drivers/usb/serial/ftdi_sio.c
+++ b/drivers/usb/serial/ftdi_sio.c
@@ -91,7 +91,7 @@ struct ftdi_private {
91 unsigned long tx_outstanding_bytes; 91 unsigned long tx_outstanding_bytes;
92 unsigned long tx_outstanding_urbs; 92 unsigned long tx_outstanding_urbs;
93 unsigned short max_packet_size; 93 unsigned short max_packet_size;
94 struct mutex cfg_lock; /* Avoid mess by parallel calls of config ioctl() */ 94 struct mutex cfg_lock; /* Avoid mess by parallel calls of config ioctl() and change_speed() */
95}; 95};
96 96
97/* struct ftdi_sio_quirk is used by devices requiring special attention. */ 97/* struct ftdi_sio_quirk is used by devices requiring special attention. */
@@ -658,6 +658,7 @@ static struct usb_device_id id_table_combined [] = {
658 { USB_DEVICE(EVOLUTION_VID, EVOLUTION_ER1_PID) }, 658 { USB_DEVICE(EVOLUTION_VID, EVOLUTION_ER1_PID) },
659 { USB_DEVICE(EVOLUTION_VID, EVO_HYBRID_PID) }, 659 { USB_DEVICE(EVOLUTION_VID, EVO_HYBRID_PID) },
660 { USB_DEVICE(EVOLUTION_VID, EVO_RCM4_PID) }, 660 { USB_DEVICE(EVOLUTION_VID, EVO_RCM4_PID) },
661 { USB_DEVICE(CONTEC_VID, CONTEC_COM1USBH_PID) },
661 { USB_DEVICE(FTDI_VID, FTDI_ARTEMIS_PID) }, 662 { USB_DEVICE(FTDI_VID, FTDI_ARTEMIS_PID) },
662 { USB_DEVICE(FTDI_VID, FTDI_ATIK_ATK16_PID) }, 663 { USB_DEVICE(FTDI_VID, FTDI_ATIK_ATK16_PID) },
663 { USB_DEVICE(FTDI_VID, FTDI_ATIK_ATK16C_PID) }, 664 { USB_DEVICE(FTDI_VID, FTDI_ATIK_ATK16C_PID) },
@@ -1272,8 +1273,8 @@ check_and_exit:
1272 (priv->flags & ASYNC_SPD_MASK)) || 1273 (priv->flags & ASYNC_SPD_MASK)) ||
1273 (((priv->flags & ASYNC_SPD_MASK) == ASYNC_SPD_CUST) && 1274 (((priv->flags & ASYNC_SPD_MASK) == ASYNC_SPD_CUST) &&
1274 (old_priv.custom_divisor != priv->custom_divisor))) { 1275 (old_priv.custom_divisor != priv->custom_divisor))) {
1275 mutex_unlock(&priv->cfg_lock);
1276 change_speed(tty, port); 1276 change_speed(tty, port);
1277 mutex_unlock(&priv->cfg_lock);
1277 } 1278 }
1278 else 1279 else
1279 mutex_unlock(&priv->cfg_lock); 1280 mutex_unlock(&priv->cfg_lock);
@@ -2264,9 +2265,11 @@ static void ftdi_set_termios(struct tty_struct *tty,
2264 clear_mctrl(port, TIOCM_DTR | TIOCM_RTS); 2265 clear_mctrl(port, TIOCM_DTR | TIOCM_RTS);
2265 } else { 2266 } else {
2266 /* set the baudrate determined before */ 2267 /* set the baudrate determined before */
2268 mutex_lock(&priv->cfg_lock);
2267 if (change_speed(tty, port)) 2269 if (change_speed(tty, port))
2268 dev_err(&port->dev, "%s urb failed to set baudrate\n", 2270 dev_err(&port->dev, "%s urb failed to set baudrate\n",
2269 __func__); 2271 __func__);
2272 mutex_unlock(&priv->cfg_lock);
2270 /* Ensure RTS and DTR are raised when baudrate changed from 0 */ 2273 /* Ensure RTS and DTR are raised when baudrate changed from 0 */
2271 if (!old_termios || (old_termios->c_cflag & CBAUD) == B0) 2274 if (!old_termios || (old_termios->c_cflag & CBAUD) == B0)
2272 set_mctrl(port, TIOCM_DTR | TIOCM_RTS); 2275 set_mctrl(port, TIOCM_DTR | TIOCM_RTS);
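The ftdi_sio hunks widen cfg_lock so that change_speed() itself runs with the lock held — in the TIOCSSERIAL path the unlock moves after the call, and in set_termios() the lock is newly taken around it — so concurrent configuration changes cannot interleave with the divisor write, matching the updated cfg_lock comment. A minimal pthread sketch of the corrected pattern (illustrative userspace code, build with -pthread):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t cfg_lock = PTHREAD_MUTEX_INITIALIZER;
static int custom_divisor = 1;

/* Programs the divisor; it must see a stable configuration, so the
 * patched driver always reaches it with cfg_lock held. */
static void change_speed(void)
{
    printf("programming divisor %d\n", custom_divisor);
}

/* TIOCSSERIAL path: update config and change speed under one lock
 * (the unlock used to come before change_speed()). */
static void set_serial_info(int new_divisor)
{
    pthread_mutex_lock(&cfg_lock);
    custom_divisor = new_divisor;
    change_speed();
    pthread_mutex_unlock(&cfg_lock);
}

/* set_termios path: the lock is newly taken around change_speed(). */
static void set_termios(void)
{
    pthread_mutex_lock(&cfg_lock);
    change_speed();
    pthread_mutex_unlock(&cfg_lock);
}

int main(void)
{
    set_serial_info(8);
    set_termios();
    return 0;
}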
diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h
index 0727e198503e..75482cbc3998 100644
--- a/drivers/usb/serial/ftdi_sio_ids.h
+++ b/drivers/usb/serial/ftdi_sio_ids.h
@@ -501,6 +501,13 @@
501#define CONTEC_COM1USBH_PID 0x8311 /* COM-1(USB)H */ 501#define CONTEC_COM1USBH_PID 0x8311 /* COM-1(USB)H */
502 502
503/* 503/*
504 * Contec products (http://www.contec.com)
505 * Submitted by Daniel Sangorrin
506 */
507#define CONTEC_VID 0x06CE /* Vendor ID */
508#define CONTEC_COM1USBH_PID 0x8311 /* COM-1(USB)H */
509
510/*
504 * Definitions for B&B Electronics products. 511 * Definitions for B&B Electronics products.
505 */ 512 */
506#define BANDB_VID 0x0856 /* B&B Electronics Vendor ID */ 513#define BANDB_VID 0x0856 /* B&B Electronics Vendor ID */
diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c
index 89fac36684c5..f804acb138ec 100644
--- a/drivers/usb/serial/generic.c
+++ b/drivers/usb/serial/generic.c
@@ -130,7 +130,7 @@ int usb_serial_generic_open(struct tty_struct *tty, struct usb_serial_port *port
130 spin_unlock_irqrestore(&port->lock, flags); 130 spin_unlock_irqrestore(&port->lock, flags);
131 131
132 /* if we have a bulk endpoint, start reading from it */ 132 /* if we have a bulk endpoint, start reading from it */
133 if (serial->num_bulk_in) { 133 if (port->bulk_in_size) {
134 /* Start reading from the device */ 134 /* Start reading from the device */
135 usb_fill_bulk_urb(port->read_urb, serial->dev, 135 usb_fill_bulk_urb(port->read_urb, serial->dev,
136 usb_rcvbulkpipe(serial->dev, 136 usb_rcvbulkpipe(serial->dev,
@@ -159,10 +159,10 @@ static void generic_cleanup(struct usb_serial_port *port)
159 dbg("%s - port %d", __func__, port->number); 159 dbg("%s - port %d", __func__, port->number);
160 160
161 if (serial->dev) { 161 if (serial->dev) {
162 /* shutdown any bulk reads that might be going on */ 162 /* shutdown any bulk transfers that might be going on */
163 if (serial->num_bulk_out) 163 if (port->bulk_out_size)
164 usb_kill_urb(port->write_urb); 164 usb_kill_urb(port->write_urb);
165 if (serial->num_bulk_in) 165 if (port->bulk_in_size)
166 usb_kill_urb(port->read_urb); 166 usb_kill_urb(port->read_urb);
167 } 167 }
168} 168}
@@ -333,15 +333,15 @@ int usb_serial_generic_write(struct tty_struct *tty,
333 333
334 dbg("%s - port %d", __func__, port->number); 334 dbg("%s - port %d", __func__, port->number);
335 335
336 /* only do something if we have a bulk out endpoint */
337 if (!port->bulk_out_size)
338 return -ENODEV;
339
336 if (count == 0) { 340 if (count == 0) {
337 dbg("%s - write request of 0 bytes", __func__); 341 dbg("%s - write request of 0 bytes", __func__);
338 return 0; 342 return 0;
339 } 343 }
340 344
341 /* only do something if we have a bulk out endpoint */
342 if (!serial->num_bulk_out)
343 return 0;
344
345 if (serial->type->max_in_flight_urbs) 345 if (serial->type->max_in_flight_urbs)
346 return usb_serial_multi_urb_write(tty, port, 346 return usb_serial_multi_urb_write(tty, port,
347 buf, count); 347 buf, count);
@@ -364,14 +364,19 @@ int usb_serial_generic_write_room(struct tty_struct *tty)
364 int room = 0; 364 int room = 0;
365 365
366 dbg("%s - port %d", __func__, port->number); 366 dbg("%s - port %d", __func__, port->number);
367
368 if (!port->bulk_out_size)
369 return 0;
370
367 spin_lock_irqsave(&port->lock, flags); 371 spin_lock_irqsave(&port->lock, flags);
368 if (serial->type->max_in_flight_urbs) { 372 if (serial->type->max_in_flight_urbs) {
369 if (port->urbs_in_flight < serial->type->max_in_flight_urbs) 373 if (port->urbs_in_flight < serial->type->max_in_flight_urbs)
370 room = port->bulk_out_size * 374 room = port->bulk_out_size *
371 (serial->type->max_in_flight_urbs - 375 (serial->type->max_in_flight_urbs -
372 port->urbs_in_flight); 376 port->urbs_in_flight);
373 } else if (serial->num_bulk_out) 377 } else {
374 room = kfifo_avail(&port->write_fifo); 378 room = kfifo_avail(&port->write_fifo);
379 }
375 spin_unlock_irqrestore(&port->lock, flags); 380 spin_unlock_irqrestore(&port->lock, flags);
376 381
377 dbg("%s - returns %d", __func__, room); 382 dbg("%s - returns %d", __func__, room);
@@ -382,15 +387,18 @@ int usb_serial_generic_chars_in_buffer(struct tty_struct *tty)
382{ 387{
383 struct usb_serial_port *port = tty->driver_data; 388 struct usb_serial_port *port = tty->driver_data;
384 struct usb_serial *serial = port->serial; 389 struct usb_serial *serial = port->serial;
385 int chars = 0;
386 unsigned long flags; 390 unsigned long flags;
391 int chars;
387 392
388 dbg("%s - port %d", __func__, port->number); 393 dbg("%s - port %d", __func__, port->number);
389 394
395 if (!port->bulk_out_size)
396 return 0;
397
390 spin_lock_irqsave(&port->lock, flags); 398 spin_lock_irqsave(&port->lock, flags);
391 if (serial->type->max_in_flight_urbs) 399 if (serial->type->max_in_flight_urbs)
392 chars = port->tx_bytes_flight; 400 chars = port->tx_bytes_flight;
393 else if (serial->num_bulk_out) 401 else
394 chars = kfifo_len(&port->write_fifo); 402 chars = kfifo_len(&port->write_fifo);
395 spin_unlock_irqrestore(&port->lock, flags); 403 spin_unlock_irqrestore(&port->lock, flags);
396 404
@@ -415,11 +423,13 @@ void usb_serial_generic_resubmit_read_urb(struct usb_serial_port *port,
415 ((serial->type->read_bulk_callback) ? 423 ((serial->type->read_bulk_callback) ?
416 serial->type->read_bulk_callback : 424 serial->type->read_bulk_callback :
417 usb_serial_generic_read_bulk_callback), port); 425 usb_serial_generic_read_bulk_callback), port);
426
418 result = usb_submit_urb(urb, mem_flags); 427 result = usb_submit_urb(urb, mem_flags);
419 if (result) 428 if (result && result != -EPERM) {
420 dev_err(&port->dev, 429 dev_err(&port->dev,
421 "%s - failed resubmitting read urb, error %d\n", 430 "%s - failed resubmitting read urb, error %d\n",
422 __func__, result); 431 __func__, result);
432 }
423} 433}
424EXPORT_SYMBOL_GPL(usb_serial_generic_resubmit_read_urb); 434EXPORT_SYMBOL_GPL(usb_serial_generic_resubmit_read_urb);
425 435
@@ -498,23 +508,18 @@ void usb_serial_generic_write_bulk_callback(struct urb *urb)
498 if (port->urbs_in_flight < 0) 508 if (port->urbs_in_flight < 0)
499 port->urbs_in_flight = 0; 509 port->urbs_in_flight = 0;
500 spin_unlock_irqrestore(&port->lock, flags); 510 spin_unlock_irqrestore(&port->lock, flags);
501
502 if (status) {
503 dbg("%s - nonzero multi-urb write bulk status "
504 "received: %d", __func__, status);
505 return;
506 }
507 } else { 511 } else {
508 port->write_urb_busy = 0; 512 port->write_urb_busy = 0;
509 513
510 if (status) { 514 if (status)
511 dbg("%s - nonzero multi-urb write bulk status "
512 "received: %d", __func__, status);
513 kfifo_reset_out(&port->write_fifo); 515 kfifo_reset_out(&port->write_fifo);
514 } else 516 else
515 usb_serial_generic_write_start(port); 517 usb_serial_generic_write_start(port);
516 } 518 }
517 519
520 if (status)
521 dbg("%s - non-zero urb status: %d", __func__, status);
522
518 usb_serial_port_softint(port); 523 usb_serial_port_softint(port);
519} 524}
520EXPORT_SYMBOL_GPL(usb_serial_generic_write_bulk_callback); 525EXPORT_SYMBOL_GPL(usb_serial_generic_write_bulk_callback);
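The generic-serial changes consistently key behaviour off the port's own endpoints (port->bulk_in_size / port->bulk_out_size) rather than the serial-wide endpoint counts, and write() on a port with no bulk-out endpoint now fails with -ENODEV instead of silently accepting 0 bytes. A condensed model of the guards plus kfifo-style room accounting (illustrative; on Linux ENODEV prints as 19):

#include <stdio.h>
#include <errno.h>

struct port {
    unsigned bulk_out_size;   /* 0 if this port has no bulk-out endpoint */
    unsigned fifo_avail;      /* free bytes in the write fifo */
};

static int generic_write(struct port *p, unsigned count)
{
    if (!p->bulk_out_size)
        return -ENODEV;       /* patched: an error, not a silent 0 */
    if (count == 0)
        return 0;
    if (count > p->fifo_avail)
        count = p->fifo_avail;
    p->fifo_avail -= count;
    return (int)count;
}

static unsigned generic_write_room(const struct port *p)
{
    if (!p->bulk_out_size)    /* no endpoint: no room, skip the fifo */
        return 0;
    return p->fifo_avail;
}

int main(void)
{
    struct port with = { 64, 128 }, without = { 0, 0 };

    printf("room=%u write=%d\n", generic_write_room(&with),
           generic_write(&with, 32));         /* room=128 write=32 */
    printf("room=%u write=%d\n", generic_write_room(&without),
           generic_write(&without, 32));      /* room=0 write=-19 */
    return 0;
}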
diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c
index 847b805d63a3..950cb311ca94 100644
--- a/drivers/usb/serial/option.c
+++ b/drivers/usb/serial/option.c
@@ -288,7 +288,9 @@ static int option_resume(struct usb_serial *serial);
288 288
289#define QUALCOMM_VENDOR_ID 0x05C6 289#define QUALCOMM_VENDOR_ID 0x05C6
290 290
291#define MAXON_VENDOR_ID 0x16d8 291#define CMOTECH_VENDOR_ID 0x16d8
292#define CMOTECH_PRODUCT_6008 0x6008
293#define CMOTECH_PRODUCT_6280 0x6280
292 294
293#define TELIT_VENDOR_ID 0x1bc7 295#define TELIT_VENDOR_ID 0x1bc7
294#define TELIT_PRODUCT_UC864E 0x1003 296#define TELIT_PRODUCT_UC864E 0x1003
@@ -309,6 +311,7 @@ static int option_resume(struct usb_serial *serial);
309#define DLINK_VENDOR_ID 0x1186 311#define DLINK_VENDOR_ID 0x1186
310#define DLINK_PRODUCT_DWM_652 0x3e04 312#define DLINK_PRODUCT_DWM_652 0x3e04
311#define DLINK_PRODUCT_DWM_652_U5 0xce16 313#define DLINK_PRODUCT_DWM_652_U5 0xce16
314#define DLINK_PRODUCT_DWM_652_U5A 0xce1e
312 315
313#define QISDA_VENDOR_ID 0x1da5 316#define QISDA_VENDOR_ID 0x1da5
314#define QISDA_PRODUCT_H21_4512 0x4512 317#define QISDA_PRODUCT_H21_4512 0x4512
@@ -332,6 +335,24 @@ static int option_resume(struct usb_serial *serial);
332#define ALCATEL_VENDOR_ID 0x1bbb 335#define ALCATEL_VENDOR_ID 0x1bbb
333#define ALCATEL_PRODUCT_X060S 0x0000 336#define ALCATEL_PRODUCT_X060S 0x0000
334 337
338#define PIRELLI_VENDOR_ID 0x1266
339#define PIRELLI_PRODUCT_C100_1 0x1002
340#define PIRELLI_PRODUCT_C100_2 0x1003
341#define PIRELLI_PRODUCT_1004 0x1004
342#define PIRELLI_PRODUCT_1005 0x1005
343#define PIRELLI_PRODUCT_1006 0x1006
344#define PIRELLI_PRODUCT_1007 0x1007
345#define PIRELLI_PRODUCT_1008 0x1008
346#define PIRELLI_PRODUCT_1009 0x1009
347#define PIRELLI_PRODUCT_100A 0x100a
348#define PIRELLI_PRODUCT_100B 0x100b
349#define PIRELLI_PRODUCT_100C 0x100c
350#define PIRELLI_PRODUCT_100D 0x100d
351#define PIRELLI_PRODUCT_100E 0x100e
352#define PIRELLI_PRODUCT_100F 0x100f
353#define PIRELLI_PRODUCT_1011 0x1011
354#define PIRELLI_PRODUCT_1012 0x1012
355
335/* Airplus products */ 356/* Airplus products */
336#define AIRPLUS_VENDOR_ID 0x1011 357#define AIRPLUS_VENDOR_ID 0x1011
337#define AIRPLUS_PRODUCT_MCD650 0x3198 358#define AIRPLUS_PRODUCT_MCD650 0x3198
@@ -547,7 +568,8 @@ static const struct usb_device_id option_ids[] = {
547 { USB_DEVICE(KYOCERA_VENDOR_ID, KYOCERA_PRODUCT_KPC680) }, 568 { USB_DEVICE(KYOCERA_VENDOR_ID, KYOCERA_PRODUCT_KPC680) },
548 { USB_DEVICE(QUALCOMM_VENDOR_ID, 0x6000)}, /* ZTE AC8700 */ 569 { USB_DEVICE(QUALCOMM_VENDOR_ID, 0x6000)}, /* ZTE AC8700 */
549 { USB_DEVICE(QUALCOMM_VENDOR_ID, 0x6613)}, /* Onda H600/ZTE MF330 */ 570 { USB_DEVICE(QUALCOMM_VENDOR_ID, 0x6613)}, /* Onda H600/ZTE MF330 */
550 { USB_DEVICE(MAXON_VENDOR_ID, 0x6280) }, /* BP3-USB & BP3-EXT HSDPA */ 571 { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_6280) }, /* BP3-USB & BP3-EXT HSDPA */
572 { USB_DEVICE(CMOTECH_VENDOR_ID, CMOTECH_PRODUCT_6008) },
551 { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_UC864E) }, 573 { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_UC864E) },
552 { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_UC864G) }, 574 { USB_DEVICE(TELIT_VENDOR_ID, TELIT_PRODUCT_UC864G) },
553 { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, ZTE_PRODUCT_MF622, 0xff, 0xff, 0xff) }, /* ZTE WCDMA products */ 575 { USB_DEVICE_AND_INTERFACE_INFO(ZTE_VENDOR_ID, ZTE_PRODUCT_MF622, 0xff, 0xff, 0xff) }, /* ZTE WCDMA products */
@@ -659,6 +681,7 @@ static const struct usb_device_id option_ids[] = {
659 { USB_DEVICE(BENQ_VENDOR_ID, BENQ_PRODUCT_H10) }, 681 { USB_DEVICE(BENQ_VENDOR_ID, BENQ_PRODUCT_H10) },
660 { USB_DEVICE(DLINK_VENDOR_ID, DLINK_PRODUCT_DWM_652) }, 682 { USB_DEVICE(DLINK_VENDOR_ID, DLINK_PRODUCT_DWM_652) },
661 { USB_DEVICE(ALINK_VENDOR_ID, DLINK_PRODUCT_DWM_652_U5) }, /* Yes, ALINK_VENDOR_ID */ 683 { USB_DEVICE(ALINK_VENDOR_ID, DLINK_PRODUCT_DWM_652_U5) }, /* Yes, ALINK_VENDOR_ID */
684 { USB_DEVICE(ALINK_VENDOR_ID, DLINK_PRODUCT_DWM_652_U5A) },
662 { USB_DEVICE(QISDA_VENDOR_ID, QISDA_PRODUCT_H21_4512) }, 685 { USB_DEVICE(QISDA_VENDOR_ID, QISDA_PRODUCT_H21_4512) },
663 { USB_DEVICE(QISDA_VENDOR_ID, QISDA_PRODUCT_H21_4523) }, 686 { USB_DEVICE(QISDA_VENDOR_ID, QISDA_PRODUCT_H21_4523) },
664 { USB_DEVICE(QISDA_VENDOR_ID, QISDA_PRODUCT_H20_4515) }, 687 { USB_DEVICE(QISDA_VENDOR_ID, QISDA_PRODUCT_H20_4515) },
@@ -666,7 +689,6 @@ static const struct usb_device_id option_ids[] = {
666 { USB_DEVICE(TOSHIBA_VENDOR_ID, TOSHIBA_PRODUCT_G450) }, 689 { USB_DEVICE(TOSHIBA_VENDOR_ID, TOSHIBA_PRODUCT_G450) },
667 { USB_DEVICE(TOSHIBA_VENDOR_ID, TOSHIBA_PRODUCT_HSDPA_MINICARD ) }, /* Toshiba 3G HSDPA == Novatel Expedite EU870D MiniCard */ 690 { USB_DEVICE(TOSHIBA_VENDOR_ID, TOSHIBA_PRODUCT_HSDPA_MINICARD ) }, /* Toshiba 3G HSDPA == Novatel Expedite EU870D MiniCard */
668 { USB_DEVICE(ALINK_VENDOR_ID, 0x9000) }, 691 { USB_DEVICE(ALINK_VENDOR_ID, 0x9000) },
669 { USB_DEVICE(ALINK_VENDOR_ID, 0xce16) },
670 { USB_DEVICE_AND_INTERFACE_INFO(ALINK_VENDOR_ID, ALINK_PRODUCT_3GU, 0xff, 0xff, 0xff) }, 692 { USB_DEVICE_AND_INTERFACE_INFO(ALINK_VENDOR_ID, ALINK_PRODUCT_3GU, 0xff, 0xff, 0xff) },
671 { USB_DEVICE(ALCATEL_VENDOR_ID, ALCATEL_PRODUCT_X060S) }, 693 { USB_DEVICE(ALCATEL_VENDOR_ID, ALCATEL_PRODUCT_X060S) },
672 { USB_DEVICE(AIRPLUS_VENDOR_ID, AIRPLUS_PRODUCT_MCD650) }, 694 { USB_DEVICE(AIRPLUS_VENDOR_ID, AIRPLUS_PRODUCT_MCD650) },
@@ -675,6 +697,24 @@ static const struct usb_device_id option_ids[] = {
675 .driver_info = (kernel_ulong_t)&four_g_w14_blacklist 697 .driver_info = (kernel_ulong_t)&four_g_w14_blacklist
676 }, 698 },
677 { USB_DEVICE(HAIER_VENDOR_ID, HAIER_PRODUCT_CE100) }, 699 { USB_DEVICE(HAIER_VENDOR_ID, HAIER_PRODUCT_CE100) },
700 /* Pirelli */
701 { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_C100_1)},
702 { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_C100_2)},
703 { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_1004)},
704 { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_1005)},
705 { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_1006)},
706 { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_1007)},
707 { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_1008)},
708 { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_1009)},
709 { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_100A)},
710 { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_100B) },
711 { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_100C) },
712 { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_100D) },
713 { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_100E) },
714 { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_100F) },
715 { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_1011)},
716 { USB_DEVICE(PIRELLI_VENDOR_ID, PIRELLI_PRODUCT_1012)},
717
678 { } /* Terminating entry */ 718 { } /* Terminating entry */
679}; 719};
680MODULE_DEVICE_TABLE(usb, option_ids); 720MODULE_DEVICE_TABLE(usb, option_ids);
@@ -798,12 +838,19 @@ static int option_probe(struct usb_serial *serial,
798 const struct usb_device_id *id) 838 const struct usb_device_id *id)
799{ 839{
800 struct option_intf_private *data; 840 struct option_intf_private *data;
841
801 /* D-Link DWM 652 still exposes CD-Rom emulation interface in modem mode */ 842 /* D-Link DWM 652 still exposes CD-Rom emulation interface in modem mode */
802 if (serial->dev->descriptor.idVendor == DLINK_VENDOR_ID && 843 if (serial->dev->descriptor.idVendor == DLINK_VENDOR_ID &&
803 serial->dev->descriptor.idProduct == DLINK_PRODUCT_DWM_652 && 844 serial->dev->descriptor.idProduct == DLINK_PRODUCT_DWM_652 &&
804 serial->interface->cur_altsetting->desc.bInterfaceClass == 0x8) 845 serial->interface->cur_altsetting->desc.bInterfaceClass == 0x8)
805 return -ENODEV; 846 return -ENODEV;
806 847
848 /* Bandrich modem and AT command interface is 0xff */
849 if ((serial->dev->descriptor.idVendor == BANDRICH_VENDOR_ID ||
850 serial->dev->descriptor.idVendor == PIRELLI_VENDOR_ID) &&
851 serial->interface->cur_altsetting->desc.bInterfaceClass != 0xff)
852 return -ENODEV;
853
807 data = serial->private = kzalloc(sizeof(struct option_intf_private), GFP_KERNEL); 854 data = serial->private = kzalloc(sizeof(struct option_intf_private), GFP_KERNEL);
808 if (!data) 855 if (!data)
809 return -ENOMEM; 856 return -ENOMEM;
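The probe filter added above refuses to bind interfaces whose class is not 0xff on BandRich and Pirelli modems, since only the vendor-specific interfaces carry the AT/serial function (the others expose e.g. CD-ROM emulation). A sketch of that filter — Pirelli's 0x1266 comes from the hunk; the BandRich vendor ID below is an assumption:

#include <stdio.h>
#include <errno.h>

/* Models the patched option_probe(): on these modems only class 0xff
 * (vendor specific) interfaces carry the serial/AT function. */
static int option_probe(unsigned short vid, unsigned char if_class)
{
    const unsigned short PIRELLI_VID  = 0x1266;
    const unsigned short BANDRICH_VID = 0x1a8d;   /* assumed value */

    if ((vid == BANDRICH_VID || vid == PIRELLI_VID) && if_class != 0xff)
        return -ENODEV;   /* e.g. a CD-ROM emulation interface */
    return 0;
}

int main(void)
{
    printf("%d %d\n", option_probe(0x1266, 0x08),    /* rejected (-19) */
                      option_probe(0x1266, 0xff));   /* bound (0) */
    return 0;
}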
diff --git a/drivers/usb/serial/qcserial.c b/drivers/usb/serial/qcserial.c
index 310ff6ec6567..53a2d5a935a2 100644
--- a/drivers/usb/serial/qcserial.c
+++ b/drivers/usb/serial/qcserial.c
@@ -47,6 +47,35 @@ static const struct usb_device_id id_table[] = {
47 {USB_DEVICE(0x05c6, 0x9221)}, /* Generic Gobi QDL device */ 47 {USB_DEVICE(0x05c6, 0x9221)}, /* Generic Gobi QDL device */
48 {USB_DEVICE(0x05c6, 0x9231)}, /* Generic Gobi QDL device */ 48 {USB_DEVICE(0x05c6, 0x9231)}, /* Generic Gobi QDL device */
49 {USB_DEVICE(0x1f45, 0x0001)}, /* Unknown Gobi QDL device */ 49 {USB_DEVICE(0x1f45, 0x0001)}, /* Unknown Gobi QDL device */
50 {USB_DEVICE(0x413c, 0x8185)}, /* Dell Gobi 2000 QDL device (N0218, VU936) */
51 {USB_DEVICE(0x413c, 0x8186)}, /* Dell Gobi 2000 Modem device (N0218, VU936) */
52 {USB_DEVICE(0x05c6, 0x9224)}, /* Sony Gobi 2000 QDL device (N0279, VU730) */
53 {USB_DEVICE(0x05c6, 0x9225)}, /* Sony Gobi 2000 Modem device (N0279, VU730) */
54 {USB_DEVICE(0x05c6, 0x9244)}, /* Samsung Gobi 2000 QDL device (VL176) */
55 {USB_DEVICE(0x05c6, 0x9245)}, /* Samsung Gobi 2000 Modem device (VL176) */
56 {USB_DEVICE(0x03f0, 0x241d)}, /* HP Gobi 2000 QDL device (VP412) */
57 {USB_DEVICE(0x03f0, 0x251d)}, /* HP Gobi 2000 Modem device (VP412) */
58 {USB_DEVICE(0x05c6, 0x9214)}, /* Acer Gobi 2000 QDL device (VP413) */
59 {USB_DEVICE(0x05c6, 0x9215)}, /* Acer Gobi 2000 Modem device (VP413) */
60 {USB_DEVICE(0x05c6, 0x9264)}, /* Asus Gobi 2000 QDL device (VR305) */
61 {USB_DEVICE(0x05c6, 0x9265)}, /* Asus Gobi 2000 Modem device (VR305) */
62 {USB_DEVICE(0x05c6, 0x9234)}, /* Top Global Gobi 2000 QDL device (VR306) */
63 {USB_DEVICE(0x05c6, 0x9235)}, /* Top Global Gobi 2000 Modem device (VR306) */
64 {USB_DEVICE(0x05c6, 0x9274)}, /* iRex Technologies Gobi 2000 QDL device (VR307) */
65 {USB_DEVICE(0x05c6, 0x9275)}, /* iRex Technologies Gobi 2000 Modem device (VR307) */
66 {USB_DEVICE(0x1199, 0x9000)}, /* Sierra Wireless Gobi 2000 QDL device (VT773) */
67 {USB_DEVICE(0x1199, 0x9001)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */
68 {USB_DEVICE(0x1199, 0x9002)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */
69 {USB_DEVICE(0x1199, 0x9003)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */
70 {USB_DEVICE(0x1199, 0x9004)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */
71 {USB_DEVICE(0x1199, 0x9005)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */
72 {USB_DEVICE(0x1199, 0x9006)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */
73 {USB_DEVICE(0x1199, 0x9007)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */
74 {USB_DEVICE(0x1199, 0x9008)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */
75 {USB_DEVICE(0x1199, 0x9009)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */
76 {USB_DEVICE(0x1199, 0x900a)}, /* Sierra Wireless Gobi 2000 Modem device (VT773) */
77 {USB_DEVICE(0x16d8, 0x8001)}, /* CMDTech Gobi 2000 QDL device (VU922) */
78 {USB_DEVICE(0x16d8, 0x8002)}, /* CMDTech Gobi 2000 Modem device (VU922) */
50 { } /* Terminating entry */ 79 { } /* Terminating entry */
51}; 80};
52MODULE_DEVICE_TABLE(usb, id_table); 81MODULE_DEVICE_TABLE(usb, id_table);
diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h
index 98b549b1cab2..ccf1dbbb87ef 100644
--- a/drivers/usb/storage/unusual_devs.h
+++ b/drivers/usb/storage/unusual_devs.h
@@ -374,6 +374,15 @@ UNUSUAL_DEV( 0x04ce, 0x0002, 0x0074, 0x0074,
374 US_SC_DEVICE, US_PR_DEVICE, NULL, 374 US_SC_DEVICE, US_PR_DEVICE, NULL,
375 US_FL_FIX_INQUIRY), 375 US_FL_FIX_INQUIRY),
376 376
377/* Reported by Ondrej Zary <linux@rainbow-software.org>
378 * The device reports one sector more and breaks when that sector is accessed
379 */
380UNUSUAL_DEV( 0x04ce, 0x0002, 0x026c, 0x026c,
381 "ScanLogic",
382 "SL11R-IDE",
383 US_SC_DEVICE, US_PR_DEVICE, NULL,
384 US_FL_FIX_CAPACITY),
385
377/* Reported by Kriston Fincher <kriston@airmail.net> 386/* Reported by Kriston Fincher <kriston@airmail.net>
378 * Patch submitted by Sean Millichamp <sean@bruenor.org> 387 * Patch submitted by Sean Millichamp <sean@bruenor.org>
379 * This is to support the Panasonic PalmCam PV-SD4090 388 * This is to support the Panasonic PalmCam PV-SD4090
@@ -1380,20 +1389,6 @@ UNUSUAL_DEV( 0x0f19, 0x0105, 0x0100, 0x0100,
1380 US_SC_DEVICE, US_PR_DEVICE, NULL, 1389 US_SC_DEVICE, US_PR_DEVICE, NULL,
1381 US_FL_IGNORE_RESIDUE ), 1390 US_FL_IGNORE_RESIDUE ),
1382 1391
1383/* Jeremy Katz <katzj@redhat.com>:
1384 * The Blackberry Pearl can run in two modes; a usb-storage only mode
1385 * and a mode that allows access via mass storage and to its database.
1386 * The berry_charge module will set the device to dual mode and thus we
1387 * should ignore its native mode if that module is built
1388 */
1389#ifdef CONFIG_USB_BERRY_CHARGE
1390UNUSUAL_DEV( 0x0fca, 0x0006, 0x0001, 0x0001,
1391 "RIM",
1392 "Blackberry Pearl",
1393 US_SC_DEVICE, US_PR_DEVICE, NULL,
1394 US_FL_IGNORE_DEVICE ),
1395#endif
1396
1397/* Reported by Michael Stattmann <michael@stattmann.com> */ 1392/* Reported by Michael Stattmann <michael@stattmann.com> */
1398UNUSUAL_DEV( 0x0fce, 0xd008, 0x0000, 0x0000, 1393UNUSUAL_DEV( 0x0fce, 0xd008, 0x0000, 0x0000,
1399 "Sony Ericsson", 1394 "Sony Ericsson",
diff --git a/drivers/uwb/hwa-rc.c b/drivers/uwb/hwa-rc.c
index e7eeb63fab23..b409c228f254 100644
--- a/drivers/uwb/hwa-rc.c
+++ b/drivers/uwb/hwa-rc.c
@@ -891,7 +891,7 @@ static int hwarc_post_reset(struct usb_interface *iface)
891} 891}
892 892
893/** USB device ID's that we handle */ 893/** USB device ID's that we handle */
894static struct usb_device_id hwarc_id_table[] = { 894static const struct usb_device_id hwarc_id_table[] = {
895 /* D-Link DUB-1210 */ 895 /* D-Link DUB-1210 */
896 { USB_DEVICE_AND_INTERFACE_INFO(0x07d1, 0x3d02, 0xe0, 0x01, 0x02), 896 { USB_DEVICE_AND_INTERFACE_INFO(0x07d1, 0x3d02, 0xe0, 0x01, 0x02),
897 .driver_info = WUSB_QUIRK_WHCI_CMD_EVT }, 897 .driver_info = WUSB_QUIRK_WHCI_CMD_EVT },
diff --git a/drivers/uwb/i1480/dfu/usb.c b/drivers/uwb/i1480/dfu/usb.c
index 0bb665a0c024..a99e211a1b87 100644
--- a/drivers/uwb/i1480/dfu/usb.c
+++ b/drivers/uwb/i1480/dfu/usb.c
@@ -120,8 +120,7 @@ int i1480_usb_write(struct i1480 *i1480, u32 memory_address,
120 result = usb_control_msg( 120 result = usb_control_msg(
121 i1480_usb->usb_dev, usb_sndctrlpipe(i1480_usb->usb_dev, 0), 121 i1480_usb->usb_dev, usb_sndctrlpipe(i1480_usb->usb_dev, 0),
122 0xf0, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 122 0xf0, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
123 cpu_to_le16(memory_address & 0xffff), 123 memory_address, (memory_address >> 16),
124 cpu_to_le16((memory_address >> 16) & 0xffff),
125 i1480->cmd_buf, buffer_size, 100 /* FIXME: arbitrary */); 124 i1480->cmd_buf, buffer_size, 100 /* FIXME: arbitrary */);
126 if (result < 0) 125 if (result < 0)
127 break; 126 break;
@@ -166,8 +165,7 @@ int i1480_usb_read(struct i1480 *i1480, u32 addr, size_t size)
166 result = usb_control_msg( 165 result = usb_control_msg(
167 i1480_usb->usb_dev, usb_rcvctrlpipe(i1480_usb->usb_dev, 0), 166 i1480_usb->usb_dev, usb_rcvctrlpipe(i1480_usb->usb_dev, 0),
168 0xf0, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 167 0xf0, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE,
169 cpu_to_le16(itr_addr & 0xffff), 168 itr_addr, (itr_addr >> 16),
170 cpu_to_le16((itr_addr >> 16) & 0xffff),
171 i1480->cmd_buf + itr, itr_size, 169 i1480->cmd_buf + itr, itr_size,
172 100 /* FIXME: arbitrary */); 170 100 /* FIXME: arbitrary */);
173 if (result < 0) { 171 if (result < 0) {
@@ -413,6 +411,10 @@ error:
413 return result; 411 return result;
414} 412}
415 413
414MODULE_FIRMWARE("i1480-pre-phy-0.0.bin");
415MODULE_FIRMWARE("i1480-usb-0.0.bin");
416MODULE_FIRMWARE("i1480-phy-0.0.bin");
417
416#define i1480_USB_DEV(v, p) \ 418#define i1480_USB_DEV(v, p) \
417{ \ 419{ \
418 .match_flags = USB_DEVICE_ID_MATCH_DEVICE \ 420 .match_flags = USB_DEVICE_ID_MATCH_DEVICE \
@@ -430,7 +432,7 @@ error:
430 432
431 433
432/** USB device ID's that we handle */ 434/** USB device ID's that we handle */
433static struct usb_device_id i1480_usb_id_table[] = { 435static const struct usb_device_id i1480_usb_id_table[] = {
434 i1480_USB_DEV(0x8086, 0xdf3b), 436 i1480_USB_DEV(0x8086, 0xdf3b),
435 i1480_USB_DEV(0x15a9, 0x0005), 437 i1480_USB_DEV(0x15a9, 0x0005),
436 i1480_USB_DEV(0x07d1, 0x3802), 438 i1480_USB_DEV(0x07d1, 0x3802),
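The cpu_to_le16() calls deleted from the i1480 control transfers were a double conversion: usb_control_msg() takes wValue and wIndex in host order and the USB core performs the little-endian swap itself, so pre-swapping corrupted the address halves on big-endian hosts (and was a no-op on little-endian ones). A self-contained demonstration of the double swap, modelling a big-endian host:

#include <stdio.h>
#include <stdint.h>

static uint16_t swap16(uint16_t v)           /* cpu_to_le16 on big-endian */
{
    return (uint16_t)((v << 8) | (v >> 8));
}

/* Models usb_control_msg() on a big-endian host: the core converts the
 * host-order wValue to little-endian wire order by itself. */
static uint16_t on_the_wire(uint16_t wValue_host_order)
{
    return swap16(wValue_host_order);
}

int main(void)
{
    uint16_t addr_lo = 0x1234;

    /* Buggy caller: pre-swaps, core swaps again -> host-order bytes on
     * the wire, which is the wrong order for USB. */
    printf("wire (buggy):   %04x\n", on_the_wire(swap16(addr_lo))); /* 1234 */

    /* Patched caller: pass host order, the core does the one swap. */
    printf("wire (patched): %04x\n", on_the_wire(addr_lo));         /* 3412 */
    return 0;
}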
diff --git a/drivers/uwb/wlp/messages.c b/drivers/uwb/wlp/messages.c
index aa42fcee4c4f..75164866c2d8 100644
--- a/drivers/uwb/wlp/messages.c
+++ b/drivers/uwb/wlp/messages.c
@@ -259,6 +259,63 @@ out:
259} 259}
260 260
261 261
262static ssize_t wlp_get_attribute(struct wlp *wlp, u16 type_code,
263 struct wlp_attr_hdr *attr_hdr, void *value, ssize_t value_len,
264 ssize_t buflen)
265{
266 struct device *dev = &wlp->rc->uwb_dev.dev;
267 ssize_t attr_len = sizeof(*attr_hdr) + value_len;
268 if (buflen < 0)
269 return -EINVAL;
270 if (buflen < attr_len) {
271 dev_err(dev, "WLP: Not enough space in buffer to parse"
272 " attribute field. Need %d, received %zu\n",
273 (int)attr_len, buflen);
274 return -EIO;
275 }
276 if (wlp_check_attr_hdr(wlp, attr_hdr, type_code, value_len) < 0) {
277 dev_err(dev, "WLP: Header verification failed. \n");
278 return -EINVAL;
279 }
280 memcpy(value, (void *)attr_hdr + sizeof(*attr_hdr), value_len);
281 return attr_len;
282}
283
284static ssize_t wlp_vget_attribute(struct wlp *wlp, u16 type_code,
285 struct wlp_attr_hdr *attr_hdr, void *value, ssize_t max_value_len,
286 ssize_t buflen)
287{
288 struct device *dev = &wlp->rc->uwb_dev.dev;
289 size_t len;
290 if (buflen < 0)
291 return -EINVAL;
292 if (buflen < sizeof(*attr_hdr)) {
293 dev_err(dev, "WLP: Not enough space in buffer to parse"
294 " header.\n");
295 return -EIO;
296 }
297 if (le16_to_cpu(attr_hdr->type) != type_code) {
298 dev_err(dev, "WLP: Unexpected attribute type. Got %u, "
299 "expected %u.\n", le16_to_cpu(attr_hdr->type),
300 type_code);
301 return -EINVAL;
302 }
303 len = le16_to_cpu(attr_hdr->length);
304 if (len > max_value_len) {
305 dev_err(dev, "WLP: Attribute larger than maximum "
306 "allowed. Received %zu, max is %d.\n", len,
307 (int)max_value_len);
308 return -EFBIG;
309 }
310 if (buflen < sizeof(*attr_hdr) + len) {
311 dev_err(dev, "WLP: Not enough space in buffer to parse "
312 "variable data.\n");
313 return -EIO;
314 }
315 memcpy(value, (void *)attr_hdr + sizeof(*attr_hdr), len);
316 return sizeof(*attr_hdr) + len;
317}
318
262/** 319/**
263 * Get value of attribute from fixed size attribute field. 320 * Get value of attribute from fixed size attribute field.
264 * 321 *
@@ -274,22 +331,8 @@ out:
274ssize_t wlp_get_##name(struct wlp *wlp, struct wlp_attr_##name *attr, \ 331ssize_t wlp_get_##name(struct wlp *wlp, struct wlp_attr_##name *attr, \
275 type *value, ssize_t buflen) \ 332 type *value, ssize_t buflen) \
276{ \ 333{ \
277 struct device *dev = &wlp->rc->uwb_dev.dev; \ 334 return wlp_get_attribute(wlp, (type_code), &attr->hdr, \
278 if (buflen < 0) \ 335 value, sizeof(*value), buflen); \
279 return -EINVAL; \
280 if (buflen < sizeof(*attr)) { \
281 dev_err(dev, "WLP: Not enough space in buffer to parse" \
282 " attribute field. Need %d, received %zu\n", \
283 (int)sizeof(*attr), buflen); \
284 return -EIO; \
285 } \
286 if (wlp_check_attr_hdr(wlp, &attr->hdr, type_code, \
287 sizeof(attr->name)) < 0) { \
288 dev_err(dev, "WLP: Header verification failed. \n"); \
289 return -EINVAL; \
290 } \
291 *value = attr->name; \
292 return sizeof(*attr); \
293} 336}
294 337
295#define wlp_get_sparse(type, type_code, name) \ 338#define wlp_get_sparse(type, type_code, name) \
@@ -313,35 +356,8 @@ static ssize_t wlp_get_##name(struct wlp *wlp, \
313 struct wlp_attr_##name *attr, \ 356 struct wlp_attr_##name *attr, \
314 type_val *value, ssize_t buflen) \ 357 type_val *value, ssize_t buflen) \
315{ \ 358{ \
316 struct device *dev = &wlp->rc->uwb_dev.dev; \ 359 return wlp_vget_attribute(wlp, (type_code), &attr->hdr, \
317 size_t len; \ 360 value, (max), buflen); \
318 if (buflen < 0) \
319 return -EINVAL; \
320 if (buflen < sizeof(*attr)) { \
321 dev_err(dev, "WLP: Not enough space in buffer to parse" \
322 " header.\n"); \
323 return -EIO; \
324 } \
325 if (le16_to_cpu(attr->hdr.type) != type_code) { \
326 dev_err(dev, "WLP: Unexpected attribute type. Got %u, " \
327 "expected %u.\n", le16_to_cpu(attr->hdr.type), \
328 type_code); \
329 return -EINVAL; \
330 } \
331 len = le16_to_cpu(attr->hdr.length); \
332 if (len > max) { \
333 dev_err(dev, "WLP: Attribute larger than maximum " \
334 "allowed. Received %zu, max is %d.\n", len, \
335 (int)max); \
336 return -EFBIG; \
337 } \
338 if (buflen < sizeof(*attr) + len) { \
339 dev_err(dev, "WLP: Not enough space in buffer to parse "\
340 "variable data.\n"); \
341 return -EIO; \
342 } \
343 memcpy(value, (void *) attr + sizeof(*attr), len); \
344 return sizeof(*attr) + len; \
345} 361}
346 362
347wlp_get(u8, WLP_ATTR_WLP_VER, version) 363wlp_get(u8, WLP_ATTR_WLP_VER, version)
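The refactor above replaces the open-coded bodies of the wlp_get() and wlp_get_sparse() macros with calls to the two new helpers, so the bounds checking and header validation live in one place. Roughly what one generated accessor now expands to (illustrative expansion, not verbatim preprocessor output):

	ssize_t wlp_get_version(struct wlp *wlp, struct wlp_attr_version *attr,
				u8 *value, ssize_t buflen)
	{
		return wlp_get_attribute(wlp, WLP_ATTR_WLP_VER, &attr->hdr,
					 value, sizeof(*value), buflen);
	}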
diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index ad37da2b6cb5..a6a88dfd5029 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -125,7 +125,7 @@ static void handle_tx(struct vhost_net *net)
125 mutex_lock(&vq->mutex); 125 mutex_lock(&vq->mutex);
126 vhost_disable_notify(vq); 126 vhost_disable_notify(vq);
127 127
128 if (wmem < sock->sk->sk_sndbuf * 2) 128 if (wmem < sock->sk->sk_sndbuf / 2)
129 tx_poll_stop(net); 129 tx_poll_stop(net);
130 hdr_size = vq->hdr_size; 130 hdr_size = vq->hdr_size;
131 131
@@ -508,12 +508,12 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
508 /* Verify that ring has been setup correctly. */ 508 /* Verify that ring has been setup correctly. */
509 if (!vhost_vq_access_ok(vq)) { 509 if (!vhost_vq_access_ok(vq)) {
510 r = -EFAULT; 510 r = -EFAULT;
511 goto err; 511 goto err_vq;
512 } 512 }
513 sock = get_socket(fd); 513 sock = get_socket(fd);
514 if (IS_ERR(sock)) { 514 if (IS_ERR(sock)) {
515 r = PTR_ERR(sock); 515 r = PTR_ERR(sock);
516 goto err; 516 goto err_vq;
517 } 517 }
518 518
519 /* start polling new socket */ 519 /* start polling new socket */
@@ -524,12 +524,14 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
524 vhost_net_disable_vq(n, vq); 524 vhost_net_disable_vq(n, vq);
525 rcu_assign_pointer(vq->private_data, sock); 525 rcu_assign_pointer(vq->private_data, sock);
526 vhost_net_enable_vq(n, vq); 526 vhost_net_enable_vq(n, vq);
527 mutex_unlock(&vq->mutex);
528done: 527done:
529 if (oldsock) { 528 if (oldsock) {
530 vhost_net_flush_vq(n, index); 529 vhost_net_flush_vq(n, index);
531 fput(oldsock->file); 530 fput(oldsock->file);
532 } 531 }
532
533err_vq:
534 mutex_unlock(&vq->mutex);
533err: 535err:
534 mutex_unlock(&n->dev.mutex); 536 mutex_unlock(&n->dev.mutex);
535 return r; 537 return r;
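The vhost_net change above fixes a mutex leak: the two early failure paths used to jump to err with vq->mutex still held. With the new err_vq label, the success path falls through done: into err_vq:, so vq->mutex is released exactly once on every path that acquired it. Simplified shape of the exit paths after the patch (excerpted from the hunk above):

	done:
		if (oldsock) {
			vhost_net_flush_vq(n, index);
			fput(oldsock->file);
		}
	err_vq:
		mutex_unlock(&vq->mutex);
	err:
		mutex_unlock(&n->dev.mutex);
		return r;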
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 7cd55e078794..7bd7a1e4409d 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -476,8 +476,10 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
476 if (r < 0) 476 if (r < 0)
477 break; 477 break;
478 eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); 478 eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
479 if (IS_ERR(eventfp)) 479 if (IS_ERR(eventfp)) {
480 return PTR_ERR(eventfp); 480 r = PTR_ERR(eventfp);
481 break;
482 }
481 if (eventfp != vq->kick) { 483 if (eventfp != vq->kick) {
482 pollstop = filep = vq->kick; 484 pollstop = filep = vq->kick;
483 pollstart = vq->kick = eventfp; 485 pollstart = vq->kick = eventfp;
@@ -489,8 +491,10 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
489 if (r < 0) 491 if (r < 0)
490 break; 492 break;
491 eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); 493 eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
492 if (IS_ERR(eventfp)) 494 if (IS_ERR(eventfp)) {
493 return PTR_ERR(eventfp); 495 r = PTR_ERR(eventfp);
496 break;
497 }
494 if (eventfp != vq->call) { 498 if (eventfp != vq->call) {
495 filep = vq->call; 499 filep = vq->call;
496 ctx = vq->call_ctx; 500 ctx = vq->call_ctx;
@@ -505,8 +509,10 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp)
505 if (r < 0) 509 if (r < 0)
506 break; 510 break;
507 eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd); 511 eventfp = f.fd == -1 ? NULL : eventfd_fget(f.fd);
508 if (IS_ERR(eventfp)) 512 if (IS_ERR(eventfp)) {
509 return PTR_ERR(eventfp); 513 r = PTR_ERR(eventfp);
514 break;
515 }
510 if (eventfp != vq->error) { 516 if (eventfp != vq->error) {
511 filep = vq->error; 517 filep = vq->error;
512 vq->error = eventfp; 518 vq->error = eventfp;
diff --git a/drivers/video/geode/lxfb.h b/drivers/video/geode/lxfb.h
index cc781c00f75d..e4c4d89b7860 100644
--- a/drivers/video/geode/lxfb.h
+++ b/drivers/video/geode/lxfb.h
@@ -365,6 +365,8 @@ enum fp_registers {
365 FP_CRC, /* 0x458 */ 365 FP_CRC, /* 0x458 */
366}; 366};
367 367
368#define FP_PT2_HSP (1 << 22)
369#define FP_PT2_VSP (1 << 23)
368#define FP_PT2_SCRC (1 << 27) /* shfclk free */ 370#define FP_PT2_SCRC (1 << 27) /* shfclk free */
369 371
370#define FP_PM_P (1 << 24) /* panel power ctl */ 372#define FP_PM_P (1 << 24) /* panel power ctl */
diff --git a/drivers/video/geode/lxfb_ops.c b/drivers/video/geode/lxfb_ops.c
index 0e5d8c7c3eba..bc35a95e59d4 100644
--- a/drivers/video/geode/lxfb_ops.c
+++ b/drivers/video/geode/lxfb_ops.c
@@ -274,7 +274,15 @@ static void lx_graphics_enable(struct fb_info *info)
274 u32 msrlo, msrhi; 274 u32 msrlo, msrhi;
275 275
276 write_fp(par, FP_PT1, 0); 276 write_fp(par, FP_PT1, 0);
277 write_fp(par, FP_PT2, FP_PT2_SCRC); 277 temp = FP_PT2_SCRC;
278
279 if (info->var.sync & FB_SYNC_HOR_HIGH_ACT)
280 temp |= FP_PT2_HSP;
281
282 if (info->var.sync & FB_SYNC_VERT_HIGH_ACT)
283 temp |= FP_PT2_VSP;
284
285 write_fp(par, FP_PT2, temp);
278 write_fp(par, FP_DFC, FP_DFC_BC); 286 write_fp(par, FP_DFC, FP_DFC_BC);
279 287
280 msrlo = MSR_LX_MSR_PADSEL_TFT_SEL_LOW; 288 msrlo = MSR_LX_MSR_PADSEL_TFT_SEL_LOW;
diff --git a/drivers/video/omap2/displays/panel-generic.c b/drivers/video/omap2/displays/panel-generic.c
index c59e4baed8b2..300eff5de1b4 100644
--- a/drivers/video/omap2/displays/panel-generic.c
+++ b/drivers/video/omap2/displays/panel-generic.c
@@ -116,6 +116,24 @@ static int generic_panel_resume(struct omap_dss_device *dssdev)
116 return 0; 116 return 0;
117} 117}
118 118
119static void generic_panel_set_timings(struct omap_dss_device *dssdev,
120 struct omap_video_timings *timings)
121{
122 dpi_set_timings(dssdev, timings);
123}
124
125static void generic_panel_get_timings(struct omap_dss_device *dssdev,
126 struct omap_video_timings *timings)
127{
128 *timings = dssdev->panel.timings;
129}
130
131static int generic_panel_check_timings(struct omap_dss_device *dssdev,
132 struct omap_video_timings *timings)
133{
134 return dpi_check_timings(dssdev, timings);
135}
136
119static struct omap_dss_driver generic_driver = { 137static struct omap_dss_driver generic_driver = {
120 .probe = generic_panel_probe, 138 .probe = generic_panel_probe,
121 .remove = generic_panel_remove, 139 .remove = generic_panel_remove,
@@ -125,6 +143,10 @@ static struct omap_dss_driver generic_driver = {
125 .suspend = generic_panel_suspend, 143 .suspend = generic_panel_suspend,
126 .resume = generic_panel_resume, 144 .resume = generic_panel_resume,
127 145
146 .set_timings = generic_panel_set_timings,
147 .get_timings = generic_panel_get_timings,
148 .check_timings = generic_panel_check_timings,
149
128 .driver = { 150 .driver = {
129 .name = "generic_panel", 151 .name = "generic_panel",
130 .owner = THIS_MODULE, 152 .owner = THIS_MODULE,
diff --git a/drivers/video/omap2/dss/dss.c b/drivers/video/omap2/dss/dss.c
index 8254a4232a53..54344184dd73 100644
--- a/drivers/video/omap2/dss/dss.c
+++ b/drivers/video/omap2/dss/dss.c
@@ -590,6 +590,9 @@ int dss_init(bool skip_init)
590 } 590 }
591 } 591 }
592 592
593 dss.dsi_clk_source = DSS_SRC_DSS1_ALWON_FCLK;
594 dss.dispc_clk_source = DSS_SRC_DSS1_ALWON_FCLK;
595
593 dss_save_context(); 596 dss_save_context();
594 597
595 rev = dss_read_reg(DSS_REVISION); 598 rev = dss_read_reg(DSS_REVISION);
diff --git a/drivers/video/omap2/vram.c b/drivers/video/omap2/vram.c
index 55a4de5e5d10..b266ffae0bde 100644
--- a/drivers/video/omap2/vram.c
+++ b/drivers/video/omap2/vram.c
@@ -511,13 +511,14 @@ static u32 omap_vram_sdram_size __initdata;
511static u32 omap_vram_def_sdram_size __initdata; 511static u32 omap_vram_def_sdram_size __initdata;
512static u32 omap_vram_def_sdram_start __initdata; 512static u32 omap_vram_def_sdram_start __initdata;
513 513
514static void __init omap_vram_early_vram(char **p) 514static int __init omap_vram_early_vram(char *p)
515{ 515{
516 omap_vram_def_sdram_size = memparse(*p, p); 516 omap_vram_def_sdram_size = memparse(p, &p);
517 if (**p == ',') 517 if (*p == ',')
518 omap_vram_def_sdram_start = simple_strtoul((*p) + 1, p, 16); 518 omap_vram_def_sdram_start = simple_strtoul(p + 1, &p, 16);
519 return 0;
519} 520}
520__early_param("vram=", omap_vram_early_vram); 521early_param("vram", omap_vram_early_vram);
521 522
522/* 523/*
523 * Called from map_io. We need to call to this early enough so that we 524 * Called from map_io. We need to call to this early enough so that we
diff --git a/drivers/video/pxa168fb.c b/drivers/video/pxa168fb.c
index 75285d3f393c..c91a7f70f7b0 100644
--- a/drivers/video/pxa168fb.c
+++ b/drivers/video/pxa168fb.c
@@ -668,7 +668,7 @@ static int __init pxa168fb_probe(struct platform_device *pdev)
668 /* 668 /*
669 * Map LCD controller registers. 669 * Map LCD controller registers.
670 */ 670 */
671 fbi->reg_base = ioremap_nocache(res->start, res->end - res->start); 671 fbi->reg_base = ioremap_nocache(res->start, resource_size(res));
672 if (fbi->reg_base == NULL) { 672 if (fbi->reg_base == NULL) {
673 ret = -ENOMEM; 673 ret = -ENOMEM;
674 goto failed; 674 goto failed;
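The pxa168fb fix above cures an off-by-one: struct resource ranges are inclusive, so the mapped length is end - start + 1, which is exactly what resource_size() computes (its definition from include/linux/ioport.h, for reference):

	static inline resource_size_t resource_size(const struct resource *res)
	{
		return res->end - res->start + 1;
	}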
diff --git a/fs/Kconfig b/fs/Kconfig
index 7405f071be67..5f85b5947613 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -235,6 +235,7 @@ config NFS_COMMON
235 235
236source "net/sunrpc/Kconfig" 236source "net/sunrpc/Kconfig"
237source "fs/smbfs/Kconfig" 237source "fs/smbfs/Kconfig"
238source "fs/ceph/Kconfig"
238source "fs/cifs/Kconfig" 239source "fs/cifs/Kconfig"
239source "fs/ncpfs/Kconfig" 240source "fs/ncpfs/Kconfig"
240source "fs/coda/Kconfig" 241source "fs/coda/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index c3633aa46911..97f340f14ba2 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -125,3 +125,4 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
125obj-$(CONFIG_BTRFS_FS) += btrfs/ 125obj-$(CONFIG_BTRFS_FS) += btrfs/
126obj-$(CONFIG_GFS2_FS) += gfs2/ 126obj-$(CONFIG_GFS2_FS) += gfs2/
127obj-$(CONFIG_EXOFS_FS) += exofs/ 127obj-$(CONFIG_EXOFS_FS) += exofs/
128obj-$(CONFIG_CEPH_FS) += ceph/
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 3ef504370034..bb4ed144d0e4 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -189,8 +189,9 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order)
189 if (!permits) 189 if (!permits)
190 goto out_unlock; 190 goto out_unlock;
191 191
192 memcpy(permits->permits, xpermits->permits, 192 if (xpermits)
193 count * sizeof(struct afs_permit)); 193 memcpy(permits->permits, xpermits->permits,
194 count * sizeof(struct afs_permit));
194 195
195 _debug("key %x access %x", 196 _debug("key %x access %x",
196 key_serial(key), vnode->status.caller_access); 197 key_serial(key), vnode->status.caller_access);
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 15d80bb35d6f..9b6aef0f75e5 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -75,14 +75,16 @@ static int aout_core_dump(struct coredump_params *cprm)
75 struct file *file = cprm->file; 75 struct file *file = cprm->file;
76 mm_segment_t fs; 76 mm_segment_t fs;
77 int has_dumped = 0; 77 int has_dumped = 0;
78 unsigned long dump_start, dump_size; 78 void __user *dump_start;
79 int dump_size;
79 struct user dump; 80 struct user dump;
80#ifdef __alpha__ 81#ifdef __alpha__
81# define START_DATA(u) (u.start_data) 82# define START_DATA(u) ((void __user *)u.start_data)
82#else 83#else
83# define START_DATA(u) ((u.u_tsize << PAGE_SHIFT) + u.start_code) 84# define START_DATA(u) ((void __user *)((u.u_tsize << PAGE_SHIFT) + \
85 u.start_code))
84#endif 86#endif
85# define START_STACK(u) (u.start_stack) 87# define START_STACK(u) ((void __user *)u.start_stack)
86 88
87 fs = get_fs(); 89 fs = get_fs();
88 set_fs(KERNEL_DS); 90 set_fs(KERNEL_DS);
@@ -104,9 +106,9 @@ static int aout_core_dump(struct coredump_params *cprm)
104 106
105/* make sure we actually have a data and stack area to dump */ 107/* make sure we actually have a data and stack area to dump */
106 set_fs(USER_DS); 108 set_fs(USER_DS);
107 if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize << PAGE_SHIFT)) 109 if (!access_ok(VERIFY_READ, START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
108 dump.u_dsize = 0; 110 dump.u_dsize = 0;
109 if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize << PAGE_SHIFT)) 111 if (!access_ok(VERIFY_READ, START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
110 dump.u_ssize = 0; 112 dump.u_ssize = 0;
111 113
112 set_fs(KERNEL_DS); 114 set_fs(KERNEL_DS);
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 2c32d00a6690..7ab23e006e4c 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1590,7 +1590,7 @@ static size_t elf_core_vma_data_size(unsigned long mm_flags)
1590 struct vm_area_struct *vma; 1590 struct vm_area_struct *vma;
1591 size_t size = 0; 1591 size_t size = 0;
1592 1592
1593 for (vma = current->mm->mmap; vma; vma->vm_next) 1593 for (vma = current->mm->mmap; vma; vma = vma->vm_next)
1594 if (maydump(vma, mm_flags)) 1594 if (maydump(vma, mm_flags))
1595 size += vma->vm_end - vma->vm_start; 1595 size += vma->vm_end - vma->vm_start;
1596 return size; 1596 return size;
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
new file mode 100644
index 000000000000..04b8280582a9
--- /dev/null
+++ b/fs/ceph/Kconfig
@@ -0,0 +1,27 @@
1config CEPH_FS
2 tristate "Ceph distributed file system (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL
4 select LIBCRC32C
 5 select CRYPTO_AES
6 help
7 Choose Y or M here to include support for mounting the
8 experimental Ceph distributed file system. Ceph is an extremely
9 scalable file system designed to provide high performance,
10 reliable access to petabytes of storage.
11
12 More information at http://ceph.newdream.net/.
13
14 If unsure, say N.
15
16config CEPH_FS_PRETTYDEBUG
17 bool "Include file:line in ceph debug output"
18 depends on CEPH_FS
19 default n
20 help
21 If you say Y here, debug output will include a filename and
 22 line to aid debugging. This increases kernel size and slows
23 execution slightly when debug call sites are enabled (e.g.,
24 via CONFIG_DYNAMIC_DEBUG).
25
26 If unsure, say N.
27
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
new file mode 100644
index 000000000000..6a660e610be8
--- /dev/null
+++ b/fs/ceph/Makefile
@@ -0,0 +1,39 @@
1#
2# Makefile for CEPH filesystem.
3#
4
5ifneq ($(KERNELRELEASE),)
6
7obj-$(CONFIG_CEPH_FS) += ceph.o
8
9ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
10 export.o caps.o snap.o xattr.o \
11 messenger.o msgpool.o buffer.o pagelist.o \
12 mds_client.o mdsmap.o \
13 mon_client.o \
14 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
15 debugfs.o \
16 auth.o auth_none.o \
17 crypto.o armor.o \
18 auth_x.o \
19 ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
20
21else
22# Otherwise we were called directly from the command
23# line; invoke the kernel build system.
24
25KERNELDIR ?= /lib/modules/$(shell uname -r)/build
26PWD := $(shell pwd)
27
28default: all
29
30all:
31 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
32
33modules_install:
34 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
35
36clean:
37 $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
38
39endif
diff --git a/fs/ceph/README b/fs/ceph/README
new file mode 100644
index 000000000000..18352fab37c0
--- /dev/null
+++ b/fs/ceph/README
@@ -0,0 +1,20 @@
1#
2# The following files are shared by (and manually synchronized
3# between) the Ceph userland and kernel client.
4#
5# userland kernel
6src/include/ceph_fs.h fs/ceph/ceph_fs.h
7src/include/ceph_fs.cc fs/ceph/ceph_fs.c
8src/include/msgr.h fs/ceph/msgr.h
9src/include/rados.h fs/ceph/rados.h
10src/include/ceph_strings.cc fs/ceph/ceph_strings.c
11src/include/ceph_frag.h fs/ceph/ceph_frag.h
12src/include/ceph_frag.cc fs/ceph/ceph_frag.c
13src/include/ceph_hash.h fs/ceph/ceph_hash.h
14src/include/ceph_hash.cc fs/ceph/ceph_hash.c
15src/crush/crush.c fs/ceph/crush/crush.c
16src/crush/crush.h fs/ceph/crush/crush.h
17src/crush/mapper.c fs/ceph/crush/mapper.c
18src/crush/mapper.h fs/ceph/crush/mapper.h
19src/crush/hash.h fs/ceph/crush/hash.h
20src/crush/hash.c fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
new file mode 100644
index 000000000000..23bb0ceabe31
--- /dev/null
+++ b/fs/ceph/addr.c
@@ -0,0 +1,1188 @@
1#include "ceph_debug.h"
2
3#include <linux/backing-dev.h>
4#include <linux/fs.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/writeback.h> /* generic_writepages */
8#include <linux/pagevec.h>
9#include <linux/task_io_accounting_ops.h>
10
11#include "super.h"
12#include "osd_client.h"
13
14/*
15 * Ceph address space ops.
16 *
17 * There are a few funny things going on here.
18 *
19 * The page->private field is used to reference a struct
20 * ceph_snap_context for _every_ dirty page. This indicates which
21 * snapshot the page was logically dirtied in, and thus which snap
22 * context needs to be associated with the osd write during writeback.
23 *
24 * Similarly, struct ceph_inode_info maintains a set of counters to
 25 * count dirty pages on the inode. In the absence of snapshots,
26 * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
27 *
28 * When a snapshot is taken (that is, when the client receives
29 * notification that a snapshot was taken), each inode with caps and
30 * with dirty pages (dirty pages implies there is a cap) gets a new
31 * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
32 * order, new snaps go to the tail). The i_wrbuffer_ref_head count is
33 * moved to capsnap->dirty. (Unless a sync write is currently in
34 * progress. In that case, the capsnap is said to be "pending", new
35 * writes cannot start, and the capsnap isn't "finalized" until the
36 * write completes (or fails) and a final size/mtime for the inode for
37 * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0.
38 *
39 * On writeback, we must submit writes to the osd IN SNAP ORDER. So,
40 * we look for the first capsnap in i_cap_snaps and write out pages in
41 * that snap context _only_. Then we move on to the next capsnap,
42 * eventually reaching the "live" or "head" context (i.e., pages that
43 * are not yet snapped) and are writing the most recently dirtied
44 * pages.
45 *
46 * Invalidate and so forth must take care to ensure the dirty page
47 * accounting is preserved.
48 */
49
50#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
51#define CONGESTION_OFF_THRESH(congestion_kb) \
52 (CONGESTION_ON_THRESH(congestion_kb) - \
53 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
54
55
56
57/*
58 * Dirty a page. Optimistically adjust accounting, on the assumption
59 * that we won't race with invalidate. If we do, readjust.
60 */
61static int ceph_set_page_dirty(struct page *page)
62{
63 struct address_space *mapping = page->mapping;
64 struct inode *inode;
65 struct ceph_inode_info *ci;
66 int undo = 0;
67 struct ceph_snap_context *snapc;
68
69 if (unlikely(!mapping))
70 return !TestSetPageDirty(page);
71
72 if (TestSetPageDirty(page)) {
73 dout("%p set_page_dirty %p idx %lu -- already dirty\n",
74 mapping->host, page, page->index);
75 return 0;
76 }
77
78 inode = mapping->host;
79 ci = ceph_inode(inode);
80
81 /*
82 * Note that we're grabbing a snapc ref here without holding
83 * any locks!
84 */
85 snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
86
87 /* dirty the head */
88 spin_lock(&inode->i_lock);
89 if (ci->i_wrbuffer_ref_head == 0)
90 ci->i_head_snapc = ceph_get_snap_context(snapc);
91 ++ci->i_wrbuffer_ref_head;
92 if (ci->i_wrbuffer_ref == 0)
93 igrab(inode);
94 ++ci->i_wrbuffer_ref;
95 dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
96 "snapc %p seq %lld (%d snaps)\n",
97 mapping->host, page, page->index,
98 ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
99 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
100 snapc, snapc->seq, snapc->num_snaps);
101 spin_unlock(&inode->i_lock);
102
103 /* now adjust page */
104 spin_lock_irq(&mapping->tree_lock);
105 if (page->mapping) { /* Race with truncate? */
106 WARN_ON_ONCE(!PageUptodate(page));
107
108 if (mapping_cap_account_dirty(mapping)) {
109 __inc_zone_page_state(page, NR_FILE_DIRTY);
110 __inc_bdi_stat(mapping->backing_dev_info,
111 BDI_RECLAIMABLE);
112 task_io_account_write(PAGE_CACHE_SIZE);
113 }
114 radix_tree_tag_set(&mapping->page_tree,
115 page_index(page), PAGECACHE_TAG_DIRTY);
116
117 /*
118 * Reference snap context in page->private. Also set
119 * PagePrivate so that we get invalidatepage callback.
120 */
121 page->private = (unsigned long)snapc;
122 SetPagePrivate(page);
123 } else {
124 dout("ANON set_page_dirty %p (raced truncate?)\n", page);
125 undo = 1;
126 }
127
128 spin_unlock_irq(&mapping->tree_lock);
129
130 if (undo)
131 /* whoops, we failed to dirty the page */
132 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
133
134 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
135
136 BUG_ON(!PageDirty(page));
137 return 1;
138}
139
140/*
141 * If we are truncating the full page (i.e. offset == 0), adjust the
142 * dirty page counters appropriately. Only called if there is private
143 * data on the page.
144 */
145static void ceph_invalidatepage(struct page *page, unsigned long offset)
146{
147 struct inode *inode;
148 struct ceph_inode_info *ci;
149 struct ceph_snap_context *snapc = (void *)page->private;
150
151 BUG_ON(!PageLocked(page));
152 BUG_ON(!page->private);
153 BUG_ON(!PagePrivate(page));
154 BUG_ON(!page->mapping);
155
156 inode = page->mapping->host;
157
158 /*
159 * We can get non-dirty pages here due to races between
160 * set_page_dirty and truncate_complete_page; just spit out a
161 * warning, in case we end up with accounting problems later.
162 */
163 if (!PageDirty(page))
164 pr_err("%p invalidatepage %p page not dirty\n", inode, page);
165
166 if (offset == 0)
167 ClearPageChecked(page);
168
169 ci = ceph_inode(inode);
170 if (offset == 0) {
171 dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
172 inode, page, page->index, offset);
173 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
174 ceph_put_snap_context(snapc);
175 page->private = 0;
176 ClearPagePrivate(page);
177 } else {
178 dout("%p invalidatepage %p idx %lu partial dirty page\n",
179 inode, page, page->index);
180 }
181}
182
183/* just a sanity check */
184static int ceph_releasepage(struct page *page, gfp_t g)
185{
186 struct inode *inode = page->mapping ? page->mapping->host : NULL;
187 dout("%p releasepage %p idx %lu\n", inode, page, page->index);
188 WARN_ON(PageDirty(page));
189 WARN_ON(page->private);
190 WARN_ON(PagePrivate(page));
191 return 0;
192}
193
194/*
195 * read a single page, without unlocking it.
196 */
197static int readpage_nounlock(struct file *filp, struct page *page)
198{
199 struct inode *inode = filp->f_dentry->d_inode;
200 struct ceph_inode_info *ci = ceph_inode(inode);
201 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
202 int err = 0;
203 u64 len = PAGE_CACHE_SIZE;
204
205 dout("readpage inode %p file %p page %p index %lu\n",
206 inode, filp, page, page->index);
207 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
208 page->index << PAGE_CACHE_SHIFT, &len,
209 ci->i_truncate_seq, ci->i_truncate_size,
210 &page, 1);
211 if (err == -ENOENT)
212 err = 0;
213 if (err < 0) {
214 SetPageError(page);
215 goto out;
216 } else if (err < PAGE_CACHE_SIZE) {
217 /* zero fill remainder of page */
218 zero_user_segment(page, err, PAGE_CACHE_SIZE);
219 }
220 SetPageUptodate(page);
221
222out:
223 return err < 0 ? err : 0;
224}
225
226static int ceph_readpage(struct file *filp, struct page *page)
227{
228 int r = readpage_nounlock(filp, page);
229 unlock_page(page);
230 return r;
231}
232
233/*
234 * Build a vector of contiguous pages from the provided page list.
235 */
236static struct page **page_vector_from_list(struct list_head *page_list,
237 unsigned *nr_pages)
238{
239 struct page **pages;
240 struct page *page;
241 int next_index, contig_pages = 0;
242
243 /* build page vector */
244 pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS);
245 if (!pages)
246 return ERR_PTR(-ENOMEM);
247
248 BUG_ON(list_empty(page_list));
249 next_index = list_entry(page_list->prev, struct page, lru)->index;
250 list_for_each_entry_reverse(page, page_list, lru) {
251 if (page->index == next_index) {
252 dout("readpages page %d %p\n", contig_pages, page);
253 pages[contig_pages] = page;
254 contig_pages++;
255 next_index++;
256 } else {
257 break;
258 }
259 }
260 *nr_pages = contig_pages;
261 return pages;
262}
263
264/*
265 * Read multiple pages. Leave pages we don't read + unlock in page_list;
266 * the caller (VM) cleans them up.
267 */
268static int ceph_readpages(struct file *file, struct address_space *mapping,
269 struct list_head *page_list, unsigned nr_pages)
270{
271 struct inode *inode = file->f_dentry->d_inode;
272 struct ceph_inode_info *ci = ceph_inode(inode);
273 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
274 int rc = 0;
275 struct page **pages;
276 struct pagevec pvec;
277 loff_t offset;
278 u64 len;
279
280 dout("readpages %p file %p nr_pages %d\n",
281 inode, file, nr_pages);
282
283 pages = page_vector_from_list(page_list, &nr_pages);
284 if (IS_ERR(pages))
285 return PTR_ERR(pages);
286
287 /* guess read extent */
288 offset = pages[0]->index << PAGE_CACHE_SHIFT;
289 len = nr_pages << PAGE_CACHE_SHIFT;
290 rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
291 offset, &len,
292 ci->i_truncate_seq, ci->i_truncate_size,
293 pages, nr_pages);
294 if (rc == -ENOENT)
295 rc = 0;
296 if (rc < 0)
297 goto out;
298
299 /* set uptodate and add to lru in pagevec-sized chunks */
300 pagevec_init(&pvec, 0);
301 for (; !list_empty(page_list) && len > 0;
302 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
303 struct page *page =
304 list_entry(page_list->prev, struct page, lru);
305
306 list_del(&page->lru);
307
308 if (rc < (int)PAGE_CACHE_SIZE) {
309 /* zero (remainder of) page */
310 int s = rc < 0 ? 0 : rc;
311 zero_user_segment(page, s, PAGE_CACHE_SIZE);
312 }
313
314 if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) {
315 page_cache_release(page);
316 dout("readpages %p add_to_page_cache failed %p\n",
317 inode, page);
318 continue;
319 }
320 dout("readpages %p adding %p idx %lu\n", inode, page,
321 page->index);
322 flush_dcache_page(page);
323 SetPageUptodate(page);
324 unlock_page(page);
325 if (pagevec_add(&pvec, page) == 0)
326 pagevec_lru_add_file(&pvec); /* add to lru */
327 }
328 pagevec_lru_add_file(&pvec);
329 rc = 0;
330
331out:
332 kfree(pages);
333 return rc;
334}
335
336/*
337 * Get ref for the oldest snapc for an inode with dirty data... that is, the
338 * only snap context we are allowed to write back.
339 *
340 * Caller holds i_lock.
341 */
342static struct ceph_snap_context *__get_oldest_context(struct inode *inode,
343 u64 *snap_size)
344{
345 struct ceph_inode_info *ci = ceph_inode(inode);
346 struct ceph_snap_context *snapc = NULL;
347 struct ceph_cap_snap *capsnap = NULL;
348
349 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
350 dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
351 capsnap->context, capsnap->dirty_pages);
352 if (capsnap->dirty_pages) {
353 snapc = ceph_get_snap_context(capsnap->context);
354 if (snap_size)
355 *snap_size = capsnap->size;
356 break;
357 }
358 }
359 if (!snapc && ci->i_snap_realm) {
360 snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
361 dout(" head snapc %p has %d dirty pages\n",
362 snapc, ci->i_wrbuffer_ref_head);
363 }
364 return snapc;
365}
366
367static struct ceph_snap_context *get_oldest_context(struct inode *inode,
368 u64 *snap_size)
369{
370 struct ceph_snap_context *snapc = NULL;
371
372 spin_lock(&inode->i_lock);
373 snapc = __get_oldest_context(inode, snap_size);
374 spin_unlock(&inode->i_lock);
375 return snapc;
376}
377
378/*
379 * Write a single page, but leave the page locked.
380 *
381 * If we get a write error, set the page error bit, but still adjust the
382 * dirty page accounting (i.e., page is no longer dirty).
383 */
384static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
385{
386 struct inode *inode;
387 struct ceph_inode_info *ci;
388 struct ceph_client *client;
389 struct ceph_osd_client *osdc;
390 loff_t page_off = page->index << PAGE_CACHE_SHIFT;
391 int len = PAGE_CACHE_SIZE;
392 loff_t i_size;
393 int err = 0;
394 struct ceph_snap_context *snapc;
395 u64 snap_size = 0;
396 long writeback_stat;
397
398 dout("writepage %p idx %lu\n", page, page->index);
399
400 if (!page->mapping || !page->mapping->host) {
401 dout("writepage %p - no mapping\n", page);
402 return -EFAULT;
403 }
404 inode = page->mapping->host;
405 ci = ceph_inode(inode);
406 client = ceph_inode_to_client(inode);
407 osdc = &client->osdc;
408
409 /* verify this is a writeable snap context */
410 snapc = (void *)page->private;
411 if (snapc == NULL) {
412 dout("writepage %p page %p not dirty?\n", inode, page);
413 goto out;
414 }
415 if (snapc != get_oldest_context(inode, &snap_size)) {
416 dout("writepage %p page %p snapc %p not writeable - noop\n",
417 inode, page, (void *)page->private);
418 /* we should only noop if called by kswapd */
419 WARN_ON((current->flags & PF_MEMALLOC) == 0);
420 goto out;
421 }
422
423 /* is this a partial page at end of file? */
424 if (snap_size)
425 i_size = snap_size;
426 else
427 i_size = i_size_read(inode);
428 if (i_size < page_off + len)
429 len = i_size - page_off;
430
431 dout("writepage %p page %p index %lu on %llu~%u\n",
432 inode, page, page->index, page_off, len);
433
434 writeback_stat = atomic_long_inc_return(&client->writeback_count);
435 if (writeback_stat >
436 CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
437 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
438
439 set_page_writeback(page);
440 err = ceph_osdc_writepages(osdc, ceph_vino(inode),
441 &ci->i_layout, snapc,
442 page_off, len,
443 ci->i_truncate_seq, ci->i_truncate_size,
444 &inode->i_mtime,
445 &page, 1, 0, 0, true);
446 if (err < 0) {
447 dout("writepage setting page/mapping error %d %p\n", err, page);
448 SetPageError(page);
449 mapping_set_error(&inode->i_data, err);
450 if (wbc)
451 wbc->pages_skipped++;
452 } else {
453 dout("writepage cleaned page %p\n", page);
454 err = 0; /* vfs expects us to return 0 */
455 }
456 page->private = 0;
457 ClearPagePrivate(page);
458 end_page_writeback(page);
459 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
460 ceph_put_snap_context(snapc);
461out:
462 return err;
463}
464
465static int ceph_writepage(struct page *page, struct writeback_control *wbc)
466{
467 int err;
468 struct inode *inode = page->mapping->host;
469 BUG_ON(!inode);
470 igrab(inode);
471 err = writepage_nounlock(page, wbc);
472 unlock_page(page);
473 iput(inode);
474 return err;
475}
476
477
478/*
479 * lame release_pages helper. release_pages() isn't exported to
480 * modules.
481 */
482static void ceph_release_pages(struct page **pages, int num)
483{
484 struct pagevec pvec;
485 int i;
486
487 pagevec_init(&pvec, 0);
488 for (i = 0; i < num; i++) {
489 if (pagevec_add(&pvec, pages[i]) == 0)
490 pagevec_release(&pvec);
491 }
492 pagevec_release(&pvec);
493}
494
495
496/*
497 * async writeback completion handler.
498 *
499 * If we get an error, set the mapping error bit, but not the individual
500 * page error bits.
501 */
502static void writepages_finish(struct ceph_osd_request *req,
503 struct ceph_msg *msg)
504{
505 struct inode *inode = req->r_inode;
506 struct ceph_osd_reply_head *replyhead;
507 struct ceph_osd_op *op;
508 struct ceph_inode_info *ci = ceph_inode(inode);
509 unsigned wrote;
510 struct page *page;
511 int i;
512 struct ceph_snap_context *snapc = req->r_snapc;
513 struct address_space *mapping = inode->i_mapping;
514 struct writeback_control *wbc = req->r_wbc;
515 __s32 rc = -EIO;
516 u64 bytes = 0;
517 struct ceph_client *client = ceph_inode_to_client(inode);
518 long writeback_stat;
519 unsigned issued = __ceph_caps_issued(ci, NULL);
520
521 /* parse reply */
522 replyhead = msg->front.iov_base;
523 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
524 op = (void *)(replyhead + 1);
525 rc = le32_to_cpu(replyhead->result);
526 bytes = le64_to_cpu(op->extent.length);
527
528 if (rc >= 0) {
529 /*
530 * Assume we wrote the pages we originally sent. The
531 * osd might reply with fewer pages if our writeback
532 * raced with a truncation and was adjusted at the osd,
533 * so don't believe the reply.
534 */
535 wrote = req->r_num_pages;
536 } else {
537 wrote = 0;
538 mapping_set_error(mapping, rc);
539 }
540 dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
541 inode, rc, bytes, wrote);
542
543 /* clean all pages */
544 for (i = 0; i < req->r_num_pages; i++) {
545 page = req->r_pages[i];
546 BUG_ON(!page);
547 WARN_ON(!PageUptodate(page));
548
549 writeback_stat =
550 atomic_long_dec_return(&client->writeback_count);
551 if (writeback_stat <
552 CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
553 clear_bdi_congested(&client->backing_dev_info,
554 BLK_RW_ASYNC);
555
556 if (i >= wrote) {
557 dout("inode %p skipping page %p\n", inode, page);
558 wbc->pages_skipped++;
559 }
560 page->private = 0;
561 ClearPagePrivate(page);
562 ceph_put_snap_context(snapc);
563 dout("unlocking %d %p\n", i, page);
564 end_page_writeback(page);
565
566 /*
567 * We lost the cache cap, need to truncate the page before
568 * it is unlocked, otherwise we'd truncate it later in the
569 * page truncation thread, possibly losing some data that
570 * raced its way in
571 */
572 if ((issued & CEPH_CAP_FILE_CACHE) == 0)
573 generic_error_remove_page(inode->i_mapping, page);
574
575 unlock_page(page);
576 }
577 dout("%p wrote+cleaned %d pages\n", inode, wrote);
578 ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);
579
580 ceph_release_pages(req->r_pages, req->r_num_pages);
581 if (req->r_pages_from_pool)
582 mempool_free(req->r_pages,
583 ceph_client(inode->i_sb)->wb_pagevec_pool);
584 else
585 kfree(req->r_pages);
586 ceph_osdc_put_request(req);
587}
588
589/*
 590 * allocate a page vec, either directly or, if necessary, via the
 591 * mempool. We avoid the mempool when we can because req->r_num_pages
 592 * may be less than the maximum write size.
593 */
594static void alloc_page_vec(struct ceph_client *client,
595 struct ceph_osd_request *req)
596{
597 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
598 GFP_NOFS);
599 if (!req->r_pages) {
600 req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS);
601 req->r_pages_from_pool = 1;
602 WARN_ON(!req->r_pages);
603 }
604}
605
606/*
607 * initiate async writeback
608 */
609static int ceph_writepages_start(struct address_space *mapping,
610 struct writeback_control *wbc)
611{
612 struct inode *inode = mapping->host;
613 struct backing_dev_info *bdi = mapping->backing_dev_info;
614 struct ceph_inode_info *ci = ceph_inode(inode);
615 struct ceph_client *client;
616 pgoff_t index, start, end;
617 int range_whole = 0;
618 int should_loop = 1;
619 pgoff_t max_pages = 0, max_pages_ever = 0;
620 struct ceph_snap_context *snapc = NULL, *last_snapc = NULL;
621 struct pagevec pvec;
622 int done = 0;
623 int rc = 0;
624 unsigned wsize = 1 << inode->i_blkbits;
625 struct ceph_osd_request *req = NULL;
626 int do_sync;
627 u64 snap_size = 0;
628
629 /*
630 * Include a 'sync' in the OSD request if this is a data
631 * integrity write (e.g., O_SYNC write or fsync()), or if our
632 * cap is being revoked.
633 */
634 do_sync = wbc->sync_mode == WB_SYNC_ALL;
635 if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
636 do_sync = 1;
637 dout("writepages_start %p dosync=%d (mode=%s)\n",
638 inode, do_sync,
639 wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
640 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
641
642 client = ceph_inode_to_client(inode);
643 if (client->mount_state == CEPH_MOUNT_SHUTDOWN) {
644 pr_warning("writepage_start %p on forced umount\n", inode);
645 return -EIO; /* we're in a forced umount, don't write! */
646 }
647 if (client->mount_args->wsize && client->mount_args->wsize < wsize)
648 wsize = client->mount_args->wsize;
649 if (wsize < PAGE_CACHE_SIZE)
650 wsize = PAGE_CACHE_SIZE;
651 max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
652
653 pagevec_init(&pvec, 0);
654
655 /* ?? */
656 if (wbc->nonblocking && bdi_write_congested(bdi)) {
657 dout(" writepages congested\n");
658 wbc->encountered_congestion = 1;
659 goto out_final;
660 }
661
662 /* where to start/end? */
663 if (wbc->range_cyclic) {
664 start = mapping->writeback_index; /* Start from prev offset */
665 end = -1;
666 dout(" cyclic, start at %lu\n", start);
667 } else {
668 start = wbc->range_start >> PAGE_CACHE_SHIFT;
669 end = wbc->range_end >> PAGE_CACHE_SHIFT;
670 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
671 range_whole = 1;
672 should_loop = 0;
673 dout(" not cyclic, %lu to %lu\n", start, end);
674 }
675 index = start;
676
677retry:
678 /* find oldest snap context with dirty data */
679 ceph_put_snap_context(snapc);
680 snapc = get_oldest_context(inode, &snap_size);
681 if (!snapc) {
682 /* hmm, why does writepages get called when there
683 is no dirty data? */
684 dout(" no snap context with dirty data?\n");
685 goto out;
686 }
687 dout(" oldest snapc is %p seq %lld (%d snaps)\n",
688 snapc, snapc->seq, snapc->num_snaps);
689 if (last_snapc && snapc != last_snapc) {
690 /* if we switched to a newer snapc, restart our scan at the
691 * start of the original file range. */
692 dout(" snapc differs from last pass, restarting at %lu\n",
693 index);
694 index = start;
695 }
696 last_snapc = snapc;
697
698 while (!done && index <= end) {
699 unsigned i;
700 int first;
701 pgoff_t next;
702 int pvec_pages, locked_pages;
703 struct page *page;
704 int want;
705 u64 offset, len;
706 struct ceph_osd_request_head *reqhead;
707 struct ceph_osd_op *op;
708 long writeback_stat;
709
710 next = 0;
711 locked_pages = 0;
712 max_pages = max_pages_ever;
713
714get_more_pages:
715 first = -1;
716 want = min(end - index,
717 min((pgoff_t)PAGEVEC_SIZE,
718 max_pages - (pgoff_t)locked_pages) - 1)
719 + 1;
720 pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
721 PAGECACHE_TAG_DIRTY,
722 want);
723 dout("pagevec_lookup_tag got %d\n", pvec_pages);
724 if (!pvec_pages && !locked_pages)
725 break;
726 for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
727 page = pvec.pages[i];
728 dout("? %p idx %lu\n", page, page->index);
729 if (locked_pages == 0)
730 lock_page(page); /* first page */
731 else if (!trylock_page(page))
732 break;
733
734 /* only dirty pages, or our accounting breaks */
735 if (unlikely(!PageDirty(page)) ||
736 unlikely(page->mapping != mapping)) {
737 dout("!dirty or !mapping %p\n", page);
738 unlock_page(page);
739 break;
740 }
741 if (!wbc->range_cyclic && page->index > end) {
742 dout("end of range %p\n", page);
743 done = 1;
744 unlock_page(page);
745 break;
746 }
747 if (next && (page->index != next)) {
748 dout("not consecutive %p\n", page);
749 unlock_page(page);
750 break;
751 }
752 if (wbc->sync_mode != WB_SYNC_NONE) {
753 dout("waiting on writeback %p\n", page);
754 wait_on_page_writeback(page);
755 }
756 if ((snap_size && page_offset(page) > snap_size) ||
757 (!snap_size &&
758 page_offset(page) > i_size_read(inode))) {
759 dout("%p page eof %llu\n", page, snap_size ?
760 snap_size : i_size_read(inode));
761 done = 1;
762 unlock_page(page);
763 break;
764 }
765 if (PageWriteback(page)) {
766 dout("%p under writeback\n", page);
767 unlock_page(page);
768 break;
769 }
770
771 /* only if matching snap context */
772 if (snapc != (void *)page->private) {
773 dout("page snapc %p != oldest %p\n",
774 (void *)page->private, snapc);
775 unlock_page(page);
776 if (!locked_pages)
777 continue; /* keep looking for snap */
778 break;
779 }
780
781 if (!clear_page_dirty_for_io(page)) {
782 dout("%p !clear_page_dirty_for_io\n", page);
783 unlock_page(page);
784 break;
785 }
786
787 /* ok */
788 if (locked_pages == 0) {
789 /* prepare async write request */
790 offset = page->index << PAGE_CACHE_SHIFT;
791 len = wsize;
792 req = ceph_osdc_new_request(&client->osdc,
793 &ci->i_layout,
794 ceph_vino(inode),
795 offset, &len,
796 CEPH_OSD_OP_WRITE,
797 CEPH_OSD_FLAG_WRITE |
798 CEPH_OSD_FLAG_ONDISK,
799 snapc, do_sync,
800 ci->i_truncate_seq,
801 ci->i_truncate_size,
802 &inode->i_mtime, true, 1);
803 max_pages = req->r_num_pages;
804
805 alloc_page_vec(client, req);
806 req->r_callback = writepages_finish;
807 req->r_inode = inode;
808 req->r_wbc = wbc;
809 }
810
811 /* note position of first page in pvec */
812 if (first < 0)
813 first = i;
814 dout("%p will write page %p idx %lu\n",
815 inode, page, page->index);
816
817 writeback_stat = atomic_long_inc_return(&client->writeback_count);
818 if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) {
819 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
820 }
821
822 set_page_writeback(page);
823 req->r_pages[locked_pages] = page;
824 locked_pages++;
825 next = page->index + 1;
826 }
827
828 /* did we get anything? */
829 if (!locked_pages)
830 goto release_pvec_pages;
831 if (i) {
832 int j;
833 BUG_ON(!locked_pages || first < 0);
834
835 if (pvec_pages && i == pvec_pages &&
836 locked_pages < max_pages) {
837 dout("reached end pvec, trying for more\n");
838 pagevec_reinit(&pvec);
839 goto get_more_pages;
840 }
841
842 /* shift unused pages over in the pvec... we
843 * will need to release them below. */
844 for (j = i; j < pvec_pages; j++) {
845 dout(" pvec leftover page %p\n",
846 pvec.pages[j]);
847 pvec.pages[j-i+first] = pvec.pages[j];
848 }
849 pvec.nr -= i-first;
850 }
851
852 /* submit the write */
853 offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
854 len = min((snap_size ? snap_size : i_size_read(inode)) - offset,
855 (u64)locked_pages << PAGE_CACHE_SHIFT);
856 dout("writepages got %d pages at %llu~%llu\n",
857 locked_pages, offset, len);
858
859 /* revise final length, page count */
860 req->r_num_pages = locked_pages;
861 reqhead = req->r_request->front.iov_base;
862 op = (void *)(reqhead + 1);
863 op->extent.length = cpu_to_le64(len);
864 op->payload_len = cpu_to_le32(len);
865 req->r_request->hdr.data_len = cpu_to_le32(len);
866
867 ceph_osdc_start_request(&client->osdc, req, true);
868 req = NULL;
869
870 /* continue? */
871 index = next;
872 wbc->nr_to_write -= locked_pages;
873 if (wbc->nr_to_write <= 0)
874 done = 1;
875
876release_pvec_pages:
877 dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
878 pvec.nr ? pvec.pages[0] : NULL);
879 pagevec_release(&pvec);
880
881 if (locked_pages && !done)
882 goto retry;
883 }
884
885 if (should_loop && !done) {
886 /* more to do; loop back to beginning of file */
887 dout("writepages looping back to beginning of file\n");
888 should_loop = 0;
889 index = 0;
890 goto retry;
891 }
892
893 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
894 mapping->writeback_index = index;
895
896out:
897 if (req)
898 ceph_osdc_put_request(req);
899 if (rc > 0)
900 rc = 0; /* vfs expects us to return 0 */
901 ceph_put_snap_context(snapc);
902 dout("writepages done, rc = %d\n", rc);
903out_final:
904 return rc;
905}
906
907
908
909/*
910 * See if a given @snapc is either writeable, or already written.
911 */
912static int context_is_writeable_or_written(struct inode *inode,
913 struct ceph_snap_context *snapc)
914{
915 struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
916 return !oldest || snapc->seq <= oldest->seq;
917}
918
919/*
920 * We are only allowed to write into/dirty the page if the page is
921 * clean, or already dirty within the same snap context.
922 */
923static int ceph_update_writeable_page(struct file *file,
924 loff_t pos, unsigned len,
925 struct page *page)
926{
927 struct inode *inode = file->f_dentry->d_inode;
928 struct ceph_inode_info *ci = ceph_inode(inode);
929 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
930 loff_t page_off = pos & PAGE_CACHE_MASK;
931 int pos_in_page = pos & ~PAGE_CACHE_MASK;
932 int end_in_page = pos_in_page + len;
933 loff_t i_size;
934 struct ceph_snap_context *snapc;
935 int r;
936
937retry_locked:
 938 /* writepages currently holds the page lock; if that changes later, revisit this */
939 wait_on_page_writeback(page);
940
941 /* check snap context */
942 BUG_ON(!ci->i_snap_realm);
943 down_read(&mdsc->snap_rwsem);
944 BUG_ON(!ci->i_snap_realm->cached_context);
945 if (page->private &&
946 (void *)page->private != ci->i_snap_realm->cached_context) {
947 /*
948 * this page is already dirty in another (older) snap
949 * context! is it writeable now?
950 */
951 snapc = get_oldest_context(inode, NULL);
952 up_read(&mdsc->snap_rwsem);
953
954 if (snapc != (void *)page->private) {
955 dout(" page %p snapc %p not current or oldest\n",
956 page, (void *)page->private);
957 /*
958 * queue for writeback, and wait for snapc to
959 * be writeable or written
960 */
961 snapc = ceph_get_snap_context((void *)page->private);
962 unlock_page(page);
963 ceph_queue_writeback(inode);
964 wait_event_interruptible(ci->i_cap_wq,
965 context_is_writeable_or_written(inode, snapc));
966 ceph_put_snap_context(snapc);
967 return -EAGAIN;
968 }
969
970 /* yay, writeable, do it now (without dropping page lock) */
971 dout(" page %p snapc %p not current, but oldest\n",
972 page, snapc);
973 if (!clear_page_dirty_for_io(page))
974 goto retry_locked;
975 r = writepage_nounlock(page, NULL);
976 if (r < 0)
977 goto fail_nosnap;
978 goto retry_locked;
979 }
980
981 if (PageUptodate(page)) {
982 dout(" page %p already uptodate\n", page);
983 return 0;
984 }
985
986 /* full page? */
987 if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
988 return 0;
989
990 /* past end of file? */
991 i_size = inode->i_size; /* caller holds i_mutex */
992
993 if (i_size + len > inode->i_sb->s_maxbytes) {
994 /* file is too big */
995 r = -EINVAL;
996 goto fail;
997 }
998
999 if (page_off >= i_size ||
1000 (pos_in_page == 0 && (pos+len) >= i_size &&
1001 end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
1002 dout(" zeroing %p 0 - %d and %d - %d\n",
1003 page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
1004 zero_user_segments(page,
1005 0, pos_in_page,
1006 end_in_page, PAGE_CACHE_SIZE);
1007 return 0;
1008 }
1009
1010 /* we need to read it. */
1011 up_read(&mdsc->snap_rwsem);
1012 r = readpage_nounlock(file, page);
1013 if (r < 0)
1014 goto fail_nosnap;
1015 goto retry_locked;
1016
1017fail:
1018 up_read(&mdsc->snap_rwsem);
1019fail_nosnap:
1020 unlock_page(page);
1021 return r;
1022}
1023
1024/*
1025 * We are only allowed to write into/dirty the page if the page is
1026 * clean, or already dirty within the same snap context.
1027 */
1028static int ceph_write_begin(struct file *file, struct address_space *mapping,
1029 loff_t pos, unsigned len, unsigned flags,
1030 struct page **pagep, void **fsdata)
1031{
1032 struct inode *inode = file->f_dentry->d_inode;
1033 struct page *page;
1034 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1035 int r;
1036
1037 do {
 1038 /* get a page */
1039 page = grab_cache_page_write_begin(mapping, index, 0);
1040 if (!page)
1041 return -ENOMEM;
1042 *pagep = page;
1043
1044 dout("write_begin file %p inode %p page %p %d~%d\n", file,
1045 inode, page, (int)pos, (int)len);
1046
1047 r = ceph_update_writeable_page(file, pos, len, page);
1048 } while (r == -EAGAIN);
1049
1050 return r;
1051}
1052
1053/*
1054 * we don't do anything in here that simple_write_end doesn't do
1055 * except adjust dirty page accounting and drop read lock on
1056 * mdsc->snap_rwsem.
1057 */
1058static int ceph_write_end(struct file *file, struct address_space *mapping,
1059 loff_t pos, unsigned len, unsigned copied,
1060 struct page *page, void *fsdata)
1061{
1062 struct inode *inode = file->f_dentry->d_inode;
1063 struct ceph_client *client = ceph_inode_to_client(inode);
1064 struct ceph_mds_client *mdsc = &client->mdsc;
1065 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1066 int check_cap = 0;
1067
1068 dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
1069 inode, page, (int)pos, (int)copied, (int)len);
1070
1071 /* zero the stale part of the page if we did a short copy */
1072 if (copied < len)
1073 zero_user_segment(page, from+copied, len);
1074
1075 /* did file size increase? */
 1076 /* (no need for i_size_read(); the caller holds i_mutex) */
1077 if (pos+copied > inode->i_size)
1078 check_cap = ceph_inode_set_size(inode, pos+copied);
1079
1080 if (!PageUptodate(page))
1081 SetPageUptodate(page);
1082
1083 set_page_dirty(page);
1084
1085 unlock_page(page);
1086 up_read(&mdsc->snap_rwsem);
1087 page_cache_release(page);
1088
1089 if (check_cap)
1090 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
1091
1092 return copied;
1093}
1094
1095/*
1096 * we set .direct_IO to indicate direct io is supported, but since we
1097 * intercept O_DIRECT reads and writes early, this function should
1098 * never get called.
1099 */
1100static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
1101 const struct iovec *iov,
1102 loff_t pos, unsigned long nr_segs)
1103{
1104 WARN_ON(1);
1105 return -EINVAL;
1106}
1107
1108const struct address_space_operations ceph_aops = {
1109 .readpage = ceph_readpage,
1110 .readpages = ceph_readpages,
1111 .writepage = ceph_writepage,
1112 .writepages = ceph_writepages_start,
1113 .write_begin = ceph_write_begin,
1114 .write_end = ceph_write_end,
1115 .set_page_dirty = ceph_set_page_dirty,
1116 .invalidatepage = ceph_invalidatepage,
1117 .releasepage = ceph_releasepage,
1118 .direct_IO = ceph_direct_io,
1119};
1120
1121
1122/*
1123 * vm ops
1124 */
1125
1126/*
1127 * Reuse write_begin here for simplicity.
1128 */
1129static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1130{
1131 struct inode *inode = vma->vm_file->f_dentry->d_inode;
1132 struct page *page = vmf->page;
1133 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
1134 loff_t off = page->index << PAGE_CACHE_SHIFT;
1135 loff_t size, len;
1136 int ret;
1137
1138 size = i_size_read(inode);
1139 if (off + PAGE_CACHE_SIZE <= size)
1140 len = PAGE_CACHE_SIZE;
1141 else
1142 len = size & ~PAGE_CACHE_MASK;
1143
1144 dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,
1145 off, len, page, page->index);
1146
1147 lock_page(page);
1148
1149 ret = VM_FAULT_NOPAGE;
1150 if ((off > size) ||
1151 (page->mapping != inode->i_mapping))
1152 goto out;
1153
1154 ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
1155 if (ret == 0) {
1156 /* success. we'll keep the page locked. */
1157 set_page_dirty(page);
1158 up_read(&mdsc->snap_rwsem);
1159 ret = VM_FAULT_LOCKED;
1160 } else {
1161 if (ret == -ENOMEM)
1162 ret = VM_FAULT_OOM;
1163 else
1164 ret = VM_FAULT_SIGBUS;
1165 }
1166out:
1167 dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);
1168 if (ret != VM_FAULT_LOCKED)
1169 unlock_page(page);
1170 return ret;
1171}
1172
1173static struct vm_operations_struct ceph_vmops = {
1174 .fault = filemap_fault,
1175 .page_mkwrite = ceph_page_mkwrite,
1176};
1177
1178int ceph_mmap(struct file *file, struct vm_area_struct *vma)
1179{
1180 struct address_space *mapping = file->f_mapping;
1181
1182 if (!mapping->a_ops->readpage)
1183 return -ENOEXEC;
1184 file_accessed(file);
1185 vma->vm_ops = &ceph_vmops;
1186 vma->vm_flags |= VM_CAN_NONLINEAR;
1187 return 0;
1188}
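The accounting scheme described in the header comment of addr.c boils down to a strict pairing that the functions above maintain: dirtying a page takes a ceph_snap_context reference and stores it in page->private, and whichever path cleans the page (writeback completion or a full-page invalidate) drops that reference together with the i_wrbuffer_ref counts. In miniature (simplified from ceph_set_page_dirty() and ceph_invalidatepage() above, not a standalone snippet):

	/* on dirty */
	page->private = (unsigned long)ceph_get_snap_context(snapc);
	SetPagePrivate(page);
	++ci->i_wrbuffer_ref;

	/* on clean (writeback finished or full-page invalidate) */
	ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
	ceph_put_snap_context(snapc);
	page->private = 0;
	ClearPagePrivate(page);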
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
new file mode 100644
index 000000000000..67b2c030924b
--- /dev/null
+++ b/fs/ceph/armor.c
@@ -0,0 +1,99 @@
1
2#include <linux/errno.h>
3
4/*
5 * base64 encode/decode.
6 */
7
8const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
9
10static int encode_bits(int c)
11{
12 return pem_key[c];
13}
14
15static int decode_bits(char c)
16{
17 if (c >= 'A' && c <= 'Z')
18 return c - 'A';
19 if (c >= 'a' && c <= 'z')
20 return c - 'a' + 26;
21 if (c >= '0' && c <= '9')
22 return c - '0' + 52;
23 if (c == '+')
24 return 62;
25 if (c == '/')
26 return 63;
27 if (c == '=')
28 return 0; /* just non-negative, please */
29 return -EINVAL;
30}
31
32int ceph_armor(char *dst, const char *src, const char *end)
33{
34 int olen = 0;
35 int line = 0;
36
37 while (src < end) {
38 unsigned char a, b, c;
39
40 a = *src++;
41 *dst++ = encode_bits(a >> 2);
42 if (src < end) {
43 b = *src++;
44 *dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
45 if (src < end) {
46 c = *src++;
47 *dst++ = encode_bits(((b & 15) << 2) |
48 (c >> 6));
49 *dst++ = encode_bits(c & 63);
50 } else {
51 *dst++ = encode_bits((b & 15) << 2);
52 *dst++ = '=';
53 }
54 } else {
55 *dst++ = encode_bits(((a & 3) << 4));
56 *dst++ = '=';
57 *dst++ = '=';
58 }
59 olen += 4;
60 line += 4;
61 if (line == 64) {
62 line = 0;
63 *(dst++) = '\n';
64 olen++;
65 }
66 }
67 return olen;
68}
69
70int ceph_unarmor(char *dst, const char *src, const char *end)
71{
72 int olen = 0;
73
74 while (src < end) {
75 int a, b, c, d;
76
77 if (src < end && src[0] == '\n')
78 src++;
79 if (src + 4 > end)
80 return -EINVAL;
81 a = decode_bits(src[0]);
82 b = decode_bits(src[1]);
83 c = decode_bits(src[2]);
84 d = decode_bits(src[3]);
85 if (a < 0 || b < 0 || c < 0 || d < 0)
86 return -EINVAL;
87
88 *dst++ = (a << 2) | (b >> 4);
89 if (src[2] == '=')
90 return olen + 1;
91 *dst++ = ((b & 15) << 4) | (c >> 2);
92 if (src[3] == '=')
93 return olen + 2;
94 *dst++ = ((c & 3) << 6) | d;
95 olen += 3;
96 src += 4;
97 }
98 return olen;
99}
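A hypothetical round trip through the two helpers above; the encode buffer needs 4 output bytes per 3 input bytes, plus one newline for every 64 encoded characters (the buffer sizes below are assumptions for illustration, not from the patch):

	char src[] = "secret key bytes";	/* 16 payload bytes */
	char enc[64], dec[sizeof(src)];
	int elen, dlen;

	elen = ceph_armor(enc, src, src + sizeof(src) - 1);
	dlen = ceph_unarmor(dec, enc, enc + elen);
	/* on success, dlen == sizeof(src) - 1 and dec holds the original bytes */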
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
new file mode 100644
index 000000000000..abb204fea6c7
--- /dev/null
+++ b/fs/ceph/auth.c
@@ -0,0 +1,257 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/err.h>
5
6#include "types.h"
7#include "auth_none.h"
8#include "auth_x.h"
9#include "decode.h"
10#include "super.h"
11
12#include "messenger.h"
13
14/*
15 * get protocol handler
16 */
17static u32 supported_protocols[] = {
18 CEPH_AUTH_NONE,
19 CEPH_AUTH_CEPHX
20};
21
22int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
23{
24 switch (protocol) {
25 case CEPH_AUTH_NONE:
26 return ceph_auth_none_init(ac);
27 case CEPH_AUTH_CEPHX:
28 return ceph_x_init(ac);
29 default:
30 return -ENOENT;
31 }
32}
33
34/*
35 * setup, teardown.
36 */
37struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
38{
39 struct ceph_auth_client *ac;
40 int ret;
41
42 dout("auth_init name '%s' secret '%s'\n", name, secret);
43
44 ret = -ENOMEM;
45 ac = kzalloc(sizeof(*ac), GFP_NOFS);
46 if (!ac)
47 goto out;
48
49 ac->negotiating = true;
50 if (name)
51 ac->name = name;
52 else
53 ac->name = CEPH_AUTH_NAME_DEFAULT;
54 dout("auth_init name %s secret %s\n", ac->name, secret);
55 ac->secret = secret;
56 return ac;
57
58out:
59 return ERR_PTR(ret);
60}
61
62void ceph_auth_destroy(struct ceph_auth_client *ac)
63{
64 dout("auth_destroy %p\n", ac);
65 if (ac->ops)
66 ac->ops->destroy(ac);
67 kfree(ac);
68}
69
70/*
71 * Reset occurs when reconnecting to the monitor.
72 */
73void ceph_auth_reset(struct ceph_auth_client *ac)
74{
75 dout("auth_reset %p\n", ac);
76 if (ac->ops && !ac->negotiating)
77 ac->ops->reset(ac);
78 ac->negotiating = true;
79}
80
81int ceph_entity_name_encode(const char *name, void **p, void *end)
82{
83 int len = strlen(name);
84
85 if (*p + 2*sizeof(u32) + len > end)
86 return -ERANGE;
87 ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
88 ceph_encode_32(p, len);
89 ceph_encode_copy(p, name, len);
90 return 0;
91}
92
93/*
 94 * Initiate protocol negotiation with the monitor: include our entity
 95 * name and the list of supported protocols.
96 */
97int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
98{
99 struct ceph_mon_request_header *monhdr = buf;
100 void *p = monhdr + 1, *end = buf + len, *lenp;
101 int i, num;
102 int ret;
103
104 dout("auth_build_hello\n");
105 monhdr->have_version = 0;
106 monhdr->session_mon = cpu_to_le16(-1);
107 monhdr->session_mon_tid = 0;
108
109 ceph_encode_32(&p, 0); /* no protocol, yet */
110
111 lenp = p;
112 p += sizeof(u32);
113
114 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
115 ceph_encode_8(&p, 1);
116 num = ARRAY_SIZE(supported_protocols);
117 ceph_encode_32(&p, num);
118 ceph_decode_need(&p, end, num * sizeof(u32), bad);
119 for (i = 0; i < num; i++)
120 ceph_encode_32(&p, supported_protocols[i]);
121
122 ret = ceph_entity_name_encode(ac->name, &p, end);
123 if (ret < 0)
124 return ret;
125 ceph_decode_need(&p, end, sizeof(u64), bad);
126 ceph_encode_64(&p, ac->global_id);
127
128 ceph_encode_32(&lenp, p - lenp - sizeof(u32));
129 return p - buf;
130
131bad:
132 return -ERANGE;
133}
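
Note the lenp back-patching above: the builder reserves a 4-byte hole, encodes the variable-length payload after it, then writes the payload length into the saved slot. A standalone sketch of that reserve-then-patch pattern (illustrative names; host-endian for brevity, where the kernel code encodes little-endian):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* append a length-prefixed string, back-patching the u32 length */
static size_t put_lenstr(uint8_t *buf, const char *s)
{
	uint8_t *lenp = buf;			/* remember where the length goes */
	uint8_t *p = buf + sizeof(uint32_t);	/* skip over the hole */
	uint32_t n;

	memcpy(p, s, strlen(s));
	p += strlen(s);

	n = (uint32_t)(p - lenp - sizeof(uint32_t));
	memcpy(lenp, &n, sizeof(n));		/* back-fill the reserved slot */
	return p - buf;
}

int main(void)
{
	uint8_t buf[64];
	size_t total = put_lenstr(buf, "hello");

	/* on a little-endian host the first byte holds the length, 5 */
	printf("wrote %zu bytes, payload len %u\n", total, (unsigned)buf[0]);
	return 0;
}
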
134
135int ceph_build_auth_request(struct ceph_auth_client *ac,
136 void *msg_buf, size_t msg_len)
137{
138 struct ceph_mon_request_header *monhdr = msg_buf;
139 void *p = monhdr + 1;
140 void *end = msg_buf + msg_len;
141 int ret;
142
143 monhdr->have_version = 0;
144 monhdr->session_mon = cpu_to_le16(-1);
145 monhdr->session_mon_tid = 0;
146
147 ceph_encode_32(&p, ac->protocol);
148
149 ret = ac->ops->build_request(ac, p + sizeof(u32), end);
150 if (ret < 0) {
151 pr_err("error %d building request\n", ret);
152 return ret;
153 }
154 dout(" built request %d bytes\n", ret);
155 ceph_encode_32(&p, ret);
156 return p + ret - msg_buf;
157}
158
159/*
160 * Handle auth message from monitor.
161 */
162int ceph_handle_auth_reply(struct ceph_auth_client *ac,
163 void *buf, size_t len,
164 void *reply_buf, size_t reply_len)
165{
166 void *p = buf;
167 void *end = buf + len;
168 int protocol;
169 s32 result;
170 u64 global_id;
171 void *payload, *payload_end;
172 int payload_len;
173 char *result_msg;
174 int result_msg_len;
175 int ret = -EINVAL;
176
177 dout("handle_auth_reply %p %p\n", p, end);
178 ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
179 protocol = ceph_decode_32(&p);
180 result = ceph_decode_32(&p);
181 global_id = ceph_decode_64(&p);
182 payload_len = ceph_decode_32(&p);
183 payload = p;
184 p += payload_len;
185 ceph_decode_need(&p, end, sizeof(u32), bad);
186 result_msg_len = ceph_decode_32(&p);
187 result_msg = p;
188 p += result_msg_len;
189 if (p != end)
190 goto bad;
191
192 dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
193 result_msg, global_id, payload_len);
194
195 payload_end = payload + payload_len;
196
197 if (global_id && ac->global_id != global_id) {
198 dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
199 ac->global_id = global_id;
200 }
201
202 if (ac->negotiating) {
203 /* server does not support our protocols? */
204 if (!protocol && result < 0) {
205 ret = result;
206 goto out;
207 }
208 /* set up (new) protocol handler? */
209 if (ac->protocol && ac->protocol != protocol) {
210 ac->ops->destroy(ac);
211 ac->protocol = 0;
212 ac->ops = NULL;
213 }
214 if (ac->protocol != protocol) {
215 ret = ceph_auth_init_protocol(ac, protocol);
216 if (ret) {
217 pr_err("error %d on auth protocol %d init\n",
218 ret, protocol);
219 goto out;
220 }
221 }
222
223 ac->negotiating = false;
224 }
225
226 ret = ac->ops->handle_reply(ac, result, payload, payload_end);
227 if (ret == -EAGAIN) {
228 return ceph_build_auth_request(ac, reply_buf, reply_len);
229 } else if (ret) {
230 pr_err("authentication error %d\n", ret);
231 return ret;
232 }
233 return 0;
234
235bad:
236 pr_err("failed to decode auth msg\n");
237out:
238 return ret;
239}
240
241int ceph_build_auth(struct ceph_auth_client *ac,
242 void *msg_buf, size_t msg_len)
243{
244 if (!ac->protocol)
245 return ceph_auth_build_hello(ac, msg_buf, msg_len);
246 BUG_ON(!ac->ops);
247 if (!ac->ops->is_authenticated(ac))
248 return ceph_build_auth_request(ac, msg_buf, msg_len);
249 return 0;
250}
251
252int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
253{
254 if (!ac->ops)
255 return 0;
256 return ac->ops->is_authenticated(ac);
257}
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
new file mode 100644
index 000000000000..ca4f57cfb267
--- /dev/null
+++ b/fs/ceph/auth.h
@@ -0,0 +1,84 @@
1#ifndef _FS_CEPH_AUTH_H
2#define _FS_CEPH_AUTH_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
 8 * Abstract interface for communicating with the authentication module.
9 * There is some handshake that takes place between us and the monitor
10 * to acquire the necessary keys. These are used to generate an
11 * 'authorizer' that we use when connecting to a service (mds, osd).
12 */
13
14struct ceph_auth_client;
15struct ceph_authorizer;
16
17struct ceph_auth_client_ops {
18 /*
19 * true if we are authenticated and can connect to
20 * services.
21 */
22 int (*is_authenticated)(struct ceph_auth_client *ac);
23
24 /*
25 * build requests and process replies during monitor
26 * handshake. if handle_reply returns -EAGAIN, we build
27 * another request.
28 */
29 int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
30 int (*handle_reply)(struct ceph_auth_client *ac, int result,
31 void *buf, void *end);
32
33 /*
34 * Create authorizer for connecting to a service, and verify
35 * the response to authenticate the service.
36 */
37 int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
38 struct ceph_authorizer **a,
39 void **buf, size_t *len,
40 void **reply_buf, size_t *reply_len);
41 int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
42 struct ceph_authorizer *a, size_t len);
43 void (*destroy_authorizer)(struct ceph_auth_client *ac,
44 struct ceph_authorizer *a);
45 void (*invalidate_authorizer)(struct ceph_auth_client *ac,
46 int peer_type);
47
48 /* reset when we (re)connect to a monitor */
49 void (*reset)(struct ceph_auth_client *ac);
50
51 void (*destroy)(struct ceph_auth_client *ac);
52};
53
54struct ceph_auth_client {
55 u32 protocol; /* CEPH_AUTH_* */
56 void *private; /* for use by protocol implementation */
57 const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */
58
59 bool negotiating; /* true if negotiating protocol */
60 const char *name; /* entity name */
61 u64 global_id; /* our unique id in system */
62 const char *secret; /* our secret key */
63 unsigned want_keys; /* which services we want */
64};
65
66extern struct ceph_auth_client *ceph_auth_init(const char *name,
67 const char *secret);
68extern void ceph_auth_destroy(struct ceph_auth_client *ac);
69
70extern void ceph_auth_reset(struct ceph_auth_client *ac);
71
72extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
73 void *buf, size_t len);
74extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
75 void *buf, size_t len,
76 void *reply_buf, size_t reply_len);
77extern int ceph_entity_name_encode(const char *name, void **p, void *end);
78
79extern int ceph_build_auth(struct ceph_auth_client *ac,
80 void *msg_buf, size_t msg_len);
81
82extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
83
84#endif
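
ceph_auth_client binds its ops table to whichever protocol negotiation selects, so callers dispatch through ac->ops without caring which scheme is active. A stripped-down sketch of this ops-table pattern in standalone C, with invented protocol names:

#include <stdio.h>

struct auth_client;

struct auth_ops {
	int (*is_authenticated)(struct auth_client *ac);
	const char *name;
};

struct auth_client {
	const struct auth_ops *ops;	/* NULL until a protocol is chosen */
	int have_keys;
};

static int none_authed(struct auth_client *ac) { return 1; }
static int x_authed(struct auth_client *ac) { return ac->have_keys; }

static const struct auth_ops none_ops = { none_authed, "none" };
static const struct auth_ops x_ops = { x_authed, "cephx" };

int main(void)
{
	struct auth_client ac = { &x_ops, 0 };

	/* callers never test the protocol id, only the ops hooks */
	printf("%s authed=%d\n", ac.ops->name, ac.ops->is_authenticated(&ac));
	ac.ops = &none_ops;
	printf("%s authed=%d\n", ac.ops->name, ac.ops->is_authenticated(&ac));
	return 0;
}
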
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
new file mode 100644
index 000000000000..b4ef6f0a6c85
--- /dev/null
+++ b/fs/ceph/auth_none.c
@@ -0,0 +1,121 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7
8#include "auth_none.h"
9#include "auth.h"
10#include "decode.h"
11
12static void reset(struct ceph_auth_client *ac)
13{
14 struct ceph_auth_none_info *xi = ac->private;
15
16 xi->starting = true;
17 xi->built_authorizer = false;
18}
19
20static void destroy(struct ceph_auth_client *ac)
21{
22 kfree(ac->private);
23 ac->private = NULL;
24}
25
26static int is_authenticated(struct ceph_auth_client *ac)
27{
28 struct ceph_auth_none_info *xi = ac->private;
29
30 return !xi->starting;
31}
32
33/*
 34 * the generic auth code decodes the global_id, and we carry no actual
 35 * authentication state, so nothing else happens here.
36 */
37static int handle_reply(struct ceph_auth_client *ac, int result,
38 void *buf, void *end)
39{
40 struct ceph_auth_none_info *xi = ac->private;
41
42 xi->starting = false;
43 return result;
44}
45
46/*
47 * build an 'authorizer' with our entity_name and global_id. we can
48 * reuse a single static copy since it is identical for all services
49 * we connect to.
50 */
51static int ceph_auth_none_create_authorizer(
52 struct ceph_auth_client *ac, int peer_type,
53 struct ceph_authorizer **a,
54 void **buf, size_t *len,
55 void **reply_buf, size_t *reply_len)
56{
57 struct ceph_auth_none_info *ai = ac->private;
58 struct ceph_none_authorizer *au = &ai->au;
59 void *p, *end;
60 int ret;
61
62 if (!ai->built_authorizer) {
63 p = au->buf;
64 end = p + sizeof(au->buf);
65 ceph_encode_8(&p, 1);
66 ret = ceph_entity_name_encode(ac->name, &p, end - 8);
67 if (ret < 0)
68 goto bad;
69 ceph_decode_need(&p, end, sizeof(u64), bad2);
70 ceph_encode_64(&p, ac->global_id);
71 au->buf_len = p - (void *)au->buf;
72 ai->built_authorizer = true;
73 dout("built authorizer len %d\n", au->buf_len);
74 }
75
76 *a = (struct ceph_authorizer *)au;
77 *buf = au->buf;
78 *len = au->buf_len;
79 *reply_buf = au->reply_buf;
80 *reply_len = sizeof(au->reply_buf);
81 return 0;
82
83bad2:
84 ret = -ERANGE;
85bad:
86 return ret;
87}
88
89static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
90 struct ceph_authorizer *a)
91{
92 /* nothing to do */
93}
94
95static const struct ceph_auth_client_ops ceph_auth_none_ops = {
96 .reset = reset,
97 .destroy = destroy,
98 .is_authenticated = is_authenticated,
99 .handle_reply = handle_reply,
100 .create_authorizer = ceph_auth_none_create_authorizer,
101 .destroy_authorizer = ceph_auth_none_destroy_authorizer,
102};
103
104int ceph_auth_none_init(struct ceph_auth_client *ac)
105{
106 struct ceph_auth_none_info *xi;
107
108 dout("ceph_auth_none_init %p\n", ac);
109 xi = kzalloc(sizeof(*xi), GFP_NOFS);
110 if (!xi)
111 return -ENOMEM;
112
113 xi->starting = true;
114 xi->built_authorizer = false;
115
116 ac->protocol = CEPH_AUTH_NONE;
117 ac->private = xi;
118 ac->ops = &ceph_auth_none_ops;
119 return 0;
120}
121
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
new file mode 100644
index 000000000000..56c05533a31c
--- /dev/null
+++ b/fs/ceph/auth_none.h
@@ -0,0 +1,28 @@
1#ifndef _FS_CEPH_AUTH_NONE_H
2#define _FS_CEPH_AUTH_NONE_H
3
4#include "auth.h"
5
6/*
7 * null security mode.
8 *
9 * we use a single static authorizer that simply encodes our entity name
10 * and global id.
11 */
12
13struct ceph_none_authorizer {
14 char buf[128];
15 int buf_len;
16 char reply_buf[0];
17};
18
19struct ceph_auth_none_info {
20 bool starting;
21 bool built_authorizer;
22 struct ceph_none_authorizer au; /* we only need one; it's static */
23};
24
25extern int ceph_auth_none_init(struct ceph_auth_client *ac);
26
27#endif
28
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
new file mode 100644
index 000000000000..f0318427b6da
--- /dev/null
+++ b/fs/ceph/auth_x.c
@@ -0,0 +1,656 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7
8#include "auth_x.h"
9#include "auth_x_protocol.h"
10#include "crypto.h"
11#include "auth.h"
12#include "decode.h"
13
14struct kmem_cache *ceph_x_ticketbuf_cachep;
15
16#define TEMP_TICKET_BUF_LEN 256
17
18static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
19
20static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
21{
22 struct ceph_x_info *xi = ac->private;
23 int need;
24
25 ceph_x_validate_tickets(ac, &need);
26 dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
27 ac->want_keys, need, xi->have_keys);
28 return (ac->want_keys & xi->have_keys) == ac->want_keys;
29}
30
31static int ceph_x_encrypt(struct ceph_crypto_key *secret,
32 void *ibuf, int ilen, void *obuf, size_t olen)
33{
34 struct ceph_x_encrypt_header head = {
35 .struct_v = 1,
36 .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
37 };
38 size_t len = olen - sizeof(u32);
39 int ret;
40
41 ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
42 &head, sizeof(head), ibuf, ilen);
43 if (ret)
44 return ret;
45 ceph_encode_32(&obuf, len);
46 return len + sizeof(u32);
47}
48
49static int ceph_x_decrypt(struct ceph_crypto_key *secret,
50 void **p, void *end, void *obuf, size_t olen)
51{
52 struct ceph_x_encrypt_header head;
53 size_t head_len = sizeof(head);
54 int len, ret;
55
56 len = ceph_decode_32(p);
57 if (*p + len > end)
58 return -EINVAL;
59
60 dout("ceph_x_decrypt len %d\n", len);
61 ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
62 *p, len);
63 if (ret)
64 return ret;
65 if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
66 return -EPERM;
67 *p += len;
68 return olen;
69}
70
71/*
72 * get existing (or insert new) ticket handler
73 */
74struct ceph_x_ticket_handler *get_ticket_handler(struct ceph_auth_client *ac,
75 int service)
76{
77 struct ceph_x_ticket_handler *th;
78 struct ceph_x_info *xi = ac->private;
79 struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
80
81 while (*p) {
82 parent = *p;
83 th = rb_entry(parent, struct ceph_x_ticket_handler, node);
84 if (service < th->service)
85 p = &(*p)->rb_left;
86 else if (service > th->service)
87 p = &(*p)->rb_right;
88 else
89 return th;
90 }
91
92 /* add it */
93 th = kzalloc(sizeof(*th), GFP_NOFS);
94 if (!th)
95 return ERR_PTR(-ENOMEM);
96 th->service = service;
97 rb_link_node(&th->node, parent, p);
98 rb_insert_color(&th->node, &xi->ticket_handlers);
99 return th;
100}
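
get_ticket_handler above walks the rbtree while tracking the parent and the link slot, so a lookup miss can insert the new node exactly where the search ended. The same search-then-link shape on a plain binary search tree (a simplified sketch; the kernel's rb_link_node/rb_insert_color additionally rebalance the tree):

#include <stdlib.h>
#include <stdio.h>

struct node {
	int service;
	struct node *left, *right;
};

/* find the node for @service, inserting a fresh one on a miss */
static struct node *get_or_insert(struct node **root, int service)
{
	struct node **p = root, *n;

	while (*p) {
		if (service < (*p)->service)
			p = &(*p)->left;
		else if (service > (*p)->service)
			p = &(*p)->right;
		else
			return *p;		/* hit */
	}
	n = calloc(1, sizeof(*n));		/* miss: link at the search end */
	if (!n)
		return NULL;
	n->service = service;
	*p = n;
	return n;
}

int main(void)
{
	struct node *root = NULL;

	get_or_insert(&root, 2);
	get_or_insert(&root, 1);
	printf("%s\n", get_or_insert(&root, 2) == root ? "reused" : "new");
	return 0;
}
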
101
102static void remove_ticket_handler(struct ceph_auth_client *ac,
103 struct ceph_x_ticket_handler *th)
104{
105 struct ceph_x_info *xi = ac->private;
106
107 dout("remove_ticket_handler %p %d\n", th, th->service);
108 rb_erase(&th->node, &xi->ticket_handlers);
109 ceph_crypto_key_destroy(&th->session_key);
110 if (th->ticket_blob)
111 ceph_buffer_put(th->ticket_blob);
112 kfree(th);
113}
114
115static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
116 struct ceph_crypto_key *secret,
117 void *buf, void *end)
118{
119 struct ceph_x_info *xi = ac->private;
120 int num;
121 void *p = buf;
122 int ret;
123 char *dbuf;
124 char *ticket_buf;
125 u8 struct_v;
126
127 dbuf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, GFP_NOFS);
128 if (!dbuf)
129 return -ENOMEM;
130
131 ret = -ENOMEM;
132 ticket_buf = kmem_cache_alloc(ceph_x_ticketbuf_cachep,
133 GFP_NOFS);
134 if (!ticket_buf)
135 goto out_dbuf;
136
137 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
138 struct_v = ceph_decode_8(&p);
139 if (struct_v != 1)
140 goto bad;
141 num = ceph_decode_32(&p);
142 dout("%d tickets\n", num);
143 while (num--) {
144 int type;
145 u8 struct_v;
146 struct ceph_x_ticket_handler *th;
147 void *dp, *dend;
148 int dlen;
149 char is_enc;
150 struct timespec validity;
151 struct ceph_crypto_key old_key;
152 void *tp, *tpend;
153
154 ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
155
156 type = ceph_decode_32(&p);
157 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
158
159 struct_v = ceph_decode_8(&p);
160 if (struct_v != 1)
161 goto bad;
162
163 th = get_ticket_handler(ac, type);
164 if (IS_ERR(th)) {
165 ret = PTR_ERR(th);
166 goto out;
167 }
168
169 /* blob for me */
170 dlen = ceph_x_decrypt(secret, &p, end, dbuf,
171 TEMP_TICKET_BUF_LEN);
172 if (dlen <= 0) {
173 ret = dlen;
174 goto out;
175 }
176 dout(" decrypted %d bytes\n", dlen);
177 dend = dbuf + dlen;
178 dp = dbuf;
179
180 struct_v = ceph_decode_8(&dp);
181 if (struct_v != 1)
182 goto bad;
183
184 memcpy(&old_key, &th->session_key, sizeof(old_key));
185 ret = ceph_crypto_key_decode(&th->session_key, &dp, dend);
186 if (ret)
187 goto out;
188
189 ceph_decode_copy(&dp, &th->validity, sizeof(th->validity));
190 ceph_decode_timespec(&validity, &th->validity);
191 th->expires = get_seconds() + validity.tv_sec;
192 th->renew_after = th->expires - (validity.tv_sec / 4);
193 dout(" expires=%lu renew_after=%lu\n", th->expires,
194 th->renew_after);
195
196 /* ticket blob for service */
197 ceph_decode_8_safe(&p, end, is_enc, bad);
198 tp = ticket_buf;
199 if (is_enc) {
200 /* encrypted */
201 dout(" encrypted ticket\n");
202 dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
203 TEMP_TICKET_BUF_LEN);
204 if (dlen < 0) {
205 ret = dlen;
206 goto out;
207 }
208 dlen = ceph_decode_32(&tp);
209 } else {
210 /* unencrypted */
211 ceph_decode_32_safe(&p, end, dlen, bad);
212 ceph_decode_need(&p, end, dlen, bad);
213 ceph_decode_copy(&p, ticket_buf, dlen);
214 }
215 tpend = tp + dlen;
216 dout(" ticket blob is %d bytes\n", dlen);
217 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
218 struct_v = ceph_decode_8(&tp);
219 th->secret_id = ceph_decode_64(&tp);
220 ret = ceph_decode_buffer(&th->ticket_blob, &tp, tpend);
221 if (ret)
222 goto out;
223 dout(" got ticket service %d (%s) secret_id %lld len %d\n",
224 type, ceph_entity_type_name(type), th->secret_id,
225 (int)th->ticket_blob->vec.iov_len);
226 xi->have_keys |= th->service;
227 }
228
229 ret = 0;
230out:
231 kmem_cache_free(ceph_x_ticketbuf_cachep, ticket_buf);
232out_dbuf:
233 kmem_cache_free(ceph_x_ticketbuf_cachep, dbuf);
234 return ret;
235
236bad:
237 ret = -EINVAL;
238 goto out;
239}
240
241static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
242 struct ceph_x_ticket_handler *th,
243 struct ceph_x_authorizer *au)
244{
245 int len;
246 struct ceph_x_authorize_a *msg_a;
247 struct ceph_x_authorize_b msg_b;
248 void *p, *end;
249 int ret;
250 int ticket_blob_len =
251 (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
252
253 dout("build_authorizer for %s %p\n",
254 ceph_entity_type_name(th->service), au);
255
256 len = sizeof(*msg_a) + sizeof(msg_b) + sizeof(u32) +
257 ticket_blob_len + 16;
258 dout(" need len %d\n", len);
259 if (au->buf && au->buf->alloc_len < len) {
260 ceph_buffer_put(au->buf);
261 au->buf = NULL;
262 }
263 if (!au->buf) {
264 au->buf = ceph_buffer_new(len, GFP_NOFS);
265 if (!au->buf)
266 return -ENOMEM;
267 }
268 au->service = th->service;
269
270 msg_a = au->buf->vec.iov_base;
271 msg_a->struct_v = 1;
272 msg_a->global_id = cpu_to_le64(ac->global_id);
273 msg_a->service_id = cpu_to_le32(th->service);
274 msg_a->ticket_blob.struct_v = 1;
275 msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
276 msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
277 if (ticket_blob_len) {
278 memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
279 th->ticket_blob->vec.iov_len);
280 }
281 dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
282 le64_to_cpu(msg_a->ticket_blob.secret_id));
283
284 p = msg_a + 1;
285 p += ticket_blob_len;
286 end = au->buf->vec.iov_base + au->buf->vec.iov_len;
287
288 get_random_bytes(&au->nonce, sizeof(au->nonce));
289 msg_b.struct_v = 1;
290 msg_b.nonce = cpu_to_le64(au->nonce);
291 ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
292 p, end - p);
293 if (ret < 0)
294 goto out_buf;
295 p += ret;
296 au->buf->vec.iov_len = p - au->buf->vec.iov_base;
297 dout(" built authorizer nonce %llx len %d\n", au->nonce,
298 (int)au->buf->vec.iov_len);
299 return 0;
300
301out_buf:
302 ceph_buffer_put(au->buf);
303 au->buf = NULL;
304 return ret;
305}
306
307static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
308 void **p, void *end)
309{
310 ceph_decode_need(p, end, 1 + sizeof(u64), bad);
311 ceph_encode_8(p, 1);
312 ceph_encode_64(p, th->secret_id);
313 if (th->ticket_blob) {
314 const char *buf = th->ticket_blob->vec.iov_base;
315 u32 len = th->ticket_blob->vec.iov_len;
316
317 ceph_encode_32_safe(p, end, len, bad);
318 ceph_encode_copy_safe(p, end, buf, len, bad);
319 } else {
320 ceph_encode_32_safe(p, end, 0, bad);
321 }
322
323 return 0;
324bad:
325 return -ERANGE;
326}
327
328static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
329{
330 int want = ac->want_keys;
331 struct ceph_x_info *xi = ac->private;
332 int service;
333
334 *pneed = ac->want_keys & ~(xi->have_keys);
335
336 for (service = 1; service <= want; service <<= 1) {
337 struct ceph_x_ticket_handler *th;
338
339 if (!(ac->want_keys & service))
340 continue;
341
342 if (*pneed & service)
343 continue;
344
345 th = get_ticket_handler(ac, service);
346
347 if (!th) {
348 *pneed |= service;
349 continue;
350 }
351
352 if (get_seconds() >= th->renew_after)
353 *pneed |= service;
354 if (get_seconds() >= th->expires)
355 xi->have_keys &= ~service;
356 }
357}
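
The validate loop visits each service by stepping a single bit through successive powers of two, up to the highest wanted bit. In miniature, with assumed service bit names:

#include <stdio.h>

#define SVC_MON  1
#define SVC_OSD  2
#define SVC_MDS  4

int main(void)
{
	int want = SVC_MON | SVC_MDS;	/* 0b101 */
	int service;

	/* visit every bit position at or below the highest wanted bit */
	for (service = 1; service <= want; service <<= 1)
		if (want & service)
			printf("want service bit 0x%x\n", service);
	return 0;
}
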
358
359
360static int ceph_x_build_request(struct ceph_auth_client *ac,
361 void *buf, void *end)
362{
363 struct ceph_x_info *xi = ac->private;
364 int need;
365 struct ceph_x_request_header *head = buf;
366 int ret;
367 struct ceph_x_ticket_handler *th =
368 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
369
370 ceph_x_validate_tickets(ac, &need);
371
372 dout("build_request want %x have %x need %x\n",
373 ac->want_keys, xi->have_keys, need);
374
375 if (need & CEPH_ENTITY_TYPE_AUTH) {
376 struct ceph_x_authenticate *auth = (void *)(head + 1);
377 void *p = auth + 1;
378 struct ceph_x_challenge_blob tmp;
379 char tmp_enc[40];
380 u64 *u;
381
382 if (p > end)
383 return -ERANGE;
384
385 dout(" get_auth_session_key\n");
386 head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
387
388 /* encrypt and hash */
389 get_random_bytes(&auth->client_challenge, sizeof(u64));
390 tmp.client_challenge = auth->client_challenge;
391 tmp.server_challenge = cpu_to_le64(xi->server_challenge);
392 ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
393 tmp_enc, sizeof(tmp_enc));
394 if (ret < 0)
395 return ret;
396
397 auth->struct_v = 1;
398 auth->key = 0;
399 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
400 auth->key ^= *u;
401 dout(" server_challenge %llx client_challenge %llx key %llx\n",
402 xi->server_challenge, le64_to_cpu(auth->client_challenge),
403 le64_to_cpu(auth->key));
404
405 /* now encode the old ticket, if one exists */
406 ret = ceph_x_encode_ticket(th, &p, end);
407 if (ret < 0)
408 return ret;
409
410 return p - buf;
411 }
412
413 if (need) {
414 void *p = head + 1;
415 struct ceph_x_service_ticket_request *req;
416
417 if (p > end)
418 return -ERANGE;
419 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
420
421 BUG_ON(!th);
422 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
423 if (ret)
424 return ret;
425 ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
426 xi->auth_authorizer.buf->vec.iov_len);
427
428 req = p;
429 req->keys = cpu_to_le32(need);
430 p += sizeof(*req);
431 return p - buf;
432 }
433
434 return 0;
435}
436
437static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
438 void *buf, void *end)
439{
440 struct ceph_x_info *xi = ac->private;
441 struct ceph_x_reply_header *head = buf;
442 struct ceph_x_ticket_handler *th;
443 int len = end - buf;
444 int op;
445 int ret;
446
447 if (result)
448 return result; /* XXX hmm? */
449
450 if (xi->starting) {
451 /* it's a hello */
452 struct ceph_x_server_challenge *sc = buf;
453
454 if (len != sizeof(*sc))
455 return -EINVAL;
456 xi->server_challenge = le64_to_cpu(sc->server_challenge);
457 dout("handle_reply got server challenge %llx\n",
458 xi->server_challenge);
459 xi->starting = false;
460 xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
461 return -EAGAIN;
462 }
463
464 op = le32_to_cpu(head->op);
465 result = le32_to_cpu(head->result);
466 dout("handle_reply op %d result %d\n", op, result);
467 switch (op) {
468 case CEPHX_GET_AUTH_SESSION_KEY:
469 /* verify auth key */
470 ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
471 buf + sizeof(*head), end);
472 break;
473
474 case CEPHX_GET_PRINCIPAL_SESSION_KEY:
475 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
476 BUG_ON(!th);
477 ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
478 buf + sizeof(*head), end);
479 break;
480
481 default:
482 return -EINVAL;
483 }
484 if (ret)
485 return ret;
486 if (ac->want_keys == xi->have_keys)
487 return 0;
488 return -EAGAIN;
489}
490
491static int ceph_x_create_authorizer(
492 struct ceph_auth_client *ac, int peer_type,
493 struct ceph_authorizer **a,
494 void **buf, size_t *len,
495 void **reply_buf, size_t *reply_len)
496{
497 struct ceph_x_authorizer *au;
498 struct ceph_x_ticket_handler *th;
499 int ret;
500
501 th = get_ticket_handler(ac, peer_type);
502 if (IS_ERR(th))
503 return PTR_ERR(th);
504
505 au = kzalloc(sizeof(*au), GFP_NOFS);
506 if (!au)
507 return -ENOMEM;
508
509 ret = ceph_x_build_authorizer(ac, th, au);
510 if (ret) {
511 kfree(au);
512 return ret;
513 }
514
515 *a = (struct ceph_authorizer *)au;
516 *buf = au->buf->vec.iov_base;
517 *len = au->buf->vec.iov_len;
518 *reply_buf = au->reply_buf;
519 *reply_len = sizeof(au->reply_buf);
520 return 0;
521}
522
523static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
524 struct ceph_authorizer *a, size_t len)
525{
526 struct ceph_x_authorizer *au = (void *)a;
527 struct ceph_x_ticket_handler *th;
528 int ret = 0;
529 struct ceph_x_authorize_reply reply;
530 void *p = au->reply_buf;
531 void *end = p + sizeof(au->reply_buf);
532
533 th = get_ticket_handler(ac, au->service);
534 if (!th)
535 return -EIO; /* hrm! */
536 ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
537 if (ret < 0)
538 return ret;
539 if (ret != sizeof(reply))
540 return -EPERM;
541
542 if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
543 ret = -EPERM;
544 else
545 ret = 0;
546 dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
547 au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
548 return ret;
549}
550
551static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
552 struct ceph_authorizer *a)
553{
554 struct ceph_x_authorizer *au = (void *)a;
555
556 ceph_buffer_put(au->buf);
557 kfree(au);
558}
559
560
561static void ceph_x_reset(struct ceph_auth_client *ac)
562{
563 struct ceph_x_info *xi = ac->private;
564
565 dout("reset\n");
566 xi->starting = true;
567 xi->server_challenge = 0;
568}
569
570static void ceph_x_destroy(struct ceph_auth_client *ac)
571{
572 struct ceph_x_info *xi = ac->private;
573 struct rb_node *p;
574
575 dout("ceph_x_destroy %p\n", ac);
576 ceph_crypto_key_destroy(&xi->secret);
577
578 while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
579 struct ceph_x_ticket_handler *th =
580 rb_entry(p, struct ceph_x_ticket_handler, node);
581 remove_ticket_handler(ac, th);
582 }
583
584 kmem_cache_destroy(ceph_x_ticketbuf_cachep);
585
586 kfree(ac->private);
587 ac->private = NULL;
588}
589
590static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
591 int peer_type)
592{
593 struct ceph_x_ticket_handler *th;
594
595 th = get_ticket_handler(ac, peer_type);
596 if (th && !IS_ERR(th))
597 remove_ticket_handler(ac, th);
598}
599
600
601static const struct ceph_auth_client_ops ceph_x_ops = {
602 .is_authenticated = ceph_x_is_authenticated,
603 .build_request = ceph_x_build_request,
604 .handle_reply = ceph_x_handle_reply,
605 .create_authorizer = ceph_x_create_authorizer,
606 .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
607 .destroy_authorizer = ceph_x_destroy_authorizer,
608 .invalidate_authorizer = ceph_x_invalidate_authorizer,
609 .reset = ceph_x_reset,
610 .destroy = ceph_x_destroy,
611};
612
613
614int ceph_x_init(struct ceph_auth_client *ac)
615{
616 struct ceph_x_info *xi;
617 int ret;
618
619 dout("ceph_x_init %p\n", ac);
620 xi = kzalloc(sizeof(*xi), GFP_NOFS);
621 if (!xi)
622 return -ENOMEM;
623
624 ret = -ENOMEM;
625 ceph_x_ticketbuf_cachep = kmem_cache_create("ceph_x_ticketbuf",
626 TEMP_TICKET_BUF_LEN, 8,
627 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
628 NULL);
629 if (!ceph_x_ticketbuf_cachep)
630 goto done_nomem;
631 ret = -EINVAL;
632 if (!ac->secret) {
633 pr_err("no secret set (for auth_x protocol)\n");
634 goto done_nomem;
635 }
636
637 ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
638 if (ret)
639 goto done_nomem;
640
641 xi->starting = true;
642 xi->ticket_handlers = RB_ROOT;
643
644 ac->protocol = CEPH_AUTH_CEPHX;
645 ac->private = xi;
646 ac->ops = &ceph_x_ops;
647 return 0;
648
649done_nomem:
650 kfree(xi);
651 if (ceph_x_ticketbuf_cachep)
652 kmem_cache_destroy(ceph_x_ticketbuf_cachep);
653 return ret;
654}
655
656
diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h
new file mode 100644
index 000000000000..ff6f8180e681
--- /dev/null
+++ b/fs/ceph/auth_x.h
@@ -0,0 +1,49 @@
1#ifndef _FS_CEPH_AUTH_X_H
2#define _FS_CEPH_AUTH_X_H
3
4#include <linux/rbtree.h>
5
6#include "crypto.h"
7#include "auth.h"
8#include "auth_x_protocol.h"
9
10/*
11 * Handle ticket for a single service.
12 */
13struct ceph_x_ticket_handler {
14 struct rb_node node;
15 unsigned service;
16
17 struct ceph_crypto_key session_key;
18 struct ceph_timespec validity;
19
20 u64 secret_id;
21 struct ceph_buffer *ticket_blob;
22
23 unsigned long renew_after, expires;
24};
25
26
27struct ceph_x_authorizer {
28 struct ceph_buffer *buf;
29 unsigned service;
30 u64 nonce;
31 char reply_buf[128]; /* big enough for encrypted blob */
32};
33
34struct ceph_x_info {
35 struct ceph_crypto_key secret;
36
37 bool starting;
38 u64 server_challenge;
39
40 unsigned have_keys;
41 struct rb_root ticket_handlers;
42
43 struct ceph_x_authorizer auth_authorizer;
44};
45
46extern int ceph_x_init(struct ceph_auth_client *ac);
47
48#endif
49
diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h
new file mode 100644
index 000000000000..671d30576c4f
--- /dev/null
+++ b/fs/ceph/auth_x_protocol.h
@@ -0,0 +1,90 @@
1#ifndef __FS_CEPH_AUTH_X_PROTOCOL
2#define __FS_CEPH_AUTH_X_PROTOCOL
3
4#define CEPHX_GET_AUTH_SESSION_KEY 0x0100
5#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
6#define CEPHX_GET_ROTATING_KEY 0x0400
7
8/* common bits */
9struct ceph_x_ticket_blob {
10 __u8 struct_v;
11 __le64 secret_id;
12 __le32 blob_len;
13 char blob[];
14} __attribute__ ((packed));
15
16
17/* common request/reply headers */
18struct ceph_x_request_header {
19 __le16 op;
20} __attribute__ ((packed));
21
22struct ceph_x_reply_header {
23 __le16 op;
24 __le32 result;
25} __attribute__ ((packed));
26
27
28/* authenticate handshake */
29
30/* initial hello (no reply header) */
31struct ceph_x_server_challenge {
32 __u8 struct_v;
33 __le64 server_challenge;
34} __attribute__ ((packed));
35
36struct ceph_x_authenticate {
37 __u8 struct_v;
38 __le64 client_challenge;
39 __le64 key;
40 /* ticket blob */
41} __attribute__ ((packed));
42
43struct ceph_x_service_ticket_request {
44 __u8 struct_v;
45 __le32 keys;
46} __attribute__ ((packed));
47
48struct ceph_x_challenge_blob {
49 __le64 server_challenge;
50 __le64 client_challenge;
51} __attribute__ ((packed));
52
53
54
55/* authorize handshake */
56
57/*
58 * The authorizer consists of two pieces:
59 * a - service id, ticket blob
60 * b - encrypted with session key
61 */
62struct ceph_x_authorize_a {
63 __u8 struct_v;
64 __le64 global_id;
65 __le32 service_id;
66 struct ceph_x_ticket_blob ticket_blob;
67} __attribute__ ((packed));
68
69struct ceph_x_authorize_b {
70 __u8 struct_v;
71 __le64 nonce;
72} __attribute__ ((packed));
73
74struct ceph_x_authorize_reply {
75 __u8 struct_v;
76 __le64 nonce_plus_one;
77} __attribute__ ((packed));
78
79
80/*
 81 * encryption bundle
82 */
83#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
84
85struct ceph_x_encrypt_header {
86 __u8 struct_v;
87 __le64 magic;
88} __attribute__ ((packed));
89
90#endif
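
Every structure here is declared packed so its in-memory layout matches the byte stream exactly, with no compiler-inserted padding between the u8 version byte and the wider fields. A standalone check of what the attribute buys (field layout mirrors ceph_x_server_challenge; sizes shown for a typical LP64 target):

#include <stdint.h>
#include <stdio.h>

struct hdr_packed {
	uint8_t struct_v;
	uint64_t val;
} __attribute__ ((packed));

struct hdr_padded {
	uint8_t struct_v;
	uint64_t val;
};

int main(void)
{
	/* packed: 9 bytes, as on the wire; padded: typically 16 */
	printf("packed=%zu padded=%zu\n",
	       sizeof(struct hdr_packed),
	       sizeof(struct hdr_padded));
	return 0;
}
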
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
new file mode 100644
index 000000000000..b98086c7aeba
--- /dev/null
+++ b/fs/ceph/buffer.c
@@ -0,0 +1,78 @@
1
2#include "ceph_debug.h"
3#include "buffer.h"
4#include "decode.h"
5
6struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
7{
8 struct ceph_buffer *b;
9
10 b = kmalloc(sizeof(*b), gfp);
11 if (!b)
12 return NULL;
13
14 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
15 if (b->vec.iov_base) {
16 b->is_vmalloc = false;
17 } else {
18 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
19 if (!b->vec.iov_base) {
20 kfree(b);
21 return NULL;
22 }
23 b->is_vmalloc = true;
24 }
25
26 kref_init(&b->kref);
27 b->alloc_len = len;
28 b->vec.iov_len = len;
29 dout("buffer_new %p\n", b);
30 return b;
31}
32
33void ceph_buffer_release(struct kref *kref)
34{
35 struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
36
37 dout("buffer_release %p\n", b);
38 if (b->vec.iov_base) {
39 if (b->is_vmalloc)
40 vfree(b->vec.iov_base);
41 else
42 kfree(b->vec.iov_base);
43 }
44 kfree(b);
45}
46
47int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp)
48{
49 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
50 if (b->vec.iov_base) {
51 b->is_vmalloc = false;
52 } else {
53 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
54 b->is_vmalloc = true;
55 }
56 if (!b->vec.iov_base)
57 return -ENOMEM;
58 b->alloc_len = len;
59 b->vec.iov_len = len;
60 return 0;
61}
62
63int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
64{
65 size_t len;
66
67 ceph_decode_need(p, end, sizeof(u32), bad);
68 len = ceph_decode_32(p);
69 dout("decode_buffer len %d\n", (int)len);
70 ceph_decode_need(p, end, len, bad);
71 *b = ceph_buffer_new(len, GFP_NOFS);
72 if (!*b)
73 return -ENOMEM;
74 ceph_decode_copy(p, (*b)->vec.iov_base, len);
75 return 0;
76bad:
77 return -EINVAL;
78}
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
new file mode 100644
index 000000000000..58d19014068f
--- /dev/null
+++ b/fs/ceph/buffer.h
@@ -0,0 +1,39 @@
1#ifndef __FS_CEPH_BUFFER_H
2#define __FS_CEPH_BUFFER_H
3
4#include <linux/kref.h>
5#include <linux/mm.h>
6#include <linux/vmalloc.h>
7#include <linux/types.h>
8#include <linux/uio.h>
9
10/*
11 * a simple reference counted buffer.
12 *
13 * use kmalloc for small sizes (<= one page), vmalloc for larger
14 * sizes.
15 */
16struct ceph_buffer {
17 struct kref kref;
18 struct kvec vec;
19 size_t alloc_len;
20 bool is_vmalloc;
21};
22
23extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
24extern void ceph_buffer_release(struct kref *kref);
25
26static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
27{
28 kref_get(&b->kref);
29 return b;
30}
31
32static inline void ceph_buffer_put(struct ceph_buffer *b)
33{
34 kref_put(&b->kref, ceph_buffer_release);
35}
36
37extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
38
39#endif
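
A userspace analogue of the get/put discipline above, with a plain counter standing in for struct kref (illustrative names; the kernel version also falls back to vmalloc for large allocations):

#include <stdlib.h>
#include <stdio.h>

struct buffer {
	int refs;
	size_t len;
	void *data;
};

static struct buffer *buffer_new(size_t len)
{
	struct buffer *b = malloc(sizeof(*b));

	if (!b)
		return NULL;
	b->data = malloc(len);
	if (!b->data) {
		free(b);
		return NULL;
	}
	b->refs = 1;		/* caller owns the initial reference */
	b->len = len;
	return b;
}

static struct buffer *buffer_get(struct buffer *b)
{
	b->refs++;
	return b;
}

static void buffer_put(struct buffer *b)
{
	if (--b->refs == 0) {	/* dropping the last reference frees it */
		free(b->data);
		free(b);
	}
}

int main(void)
{
	struct buffer *b = buffer_new(128);
	struct buffer *alias = buffer_get(b);

	buffer_put(b);		/* alias still holds a reference */
	printf("refs=%d\n", alias->refs);
	buffer_put(alias);	/* now the buffer is freed */
	return 0;
}
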
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
new file mode 100644
index 000000000000..db122bb357b8
--- /dev/null
+++ b/fs/ceph/caps.c
@@ -0,0 +1,2927 @@
1#include "ceph_debug.h"
2
3#include <linux/fs.h>
4#include <linux/kernel.h>
5#include <linux/sched.h>
6#include <linux/vmalloc.h>
7#include <linux/wait.h>
8#include <linux/writeback.h>
9
10#include "super.h"
11#include "decode.h"
12#include "messenger.h"
13
14/*
15 * Capability management
16 *
17 * The Ceph metadata servers control client access to inode metadata
18 * and file data by issuing capabilities, granting clients permission
 19 * to read and/or write both inode fields and file data to OSDs
20 * (storage nodes). Each capability consists of a set of bits
21 * indicating which operations are allowed.
22 *
23 * If the client holds a *_SHARED cap, the client has a coherent value
24 * that can be safely read from the cached inode.
25 *
 26 * In the case of *_EXCL (exclusive) or FILE_WR capabilities, the
27 * client is allowed to change inode attributes (e.g., file size,
28 * mtime), note its dirty state in the ceph_cap, and asynchronously
29 * flush that metadata change to the MDS.
30 *
31 * In the event of a conflicting operation (perhaps by another
32 * client), the MDS will revoke the conflicting client capabilities.
33 *
34 * In order for a client to cache an inode, it must hold a capability
35 * with at least one MDS server. When inodes are released, release
36 * notifications are batched and periodically sent en masse to the MDS
37 * cluster to release server state.
38 */
39
40
41/*
42 * Generate readable cap strings for debugging output.
43 */
44#define MAX_CAP_STR 20
45static char cap_str[MAX_CAP_STR][40];
46static DEFINE_SPINLOCK(cap_str_lock);
47static int last_cap_str;
48
49static char *gcap_string(char *s, int c)
50{
51 if (c & CEPH_CAP_GSHARED)
52 *s++ = 's';
53 if (c & CEPH_CAP_GEXCL)
54 *s++ = 'x';
55 if (c & CEPH_CAP_GCACHE)
56 *s++ = 'c';
57 if (c & CEPH_CAP_GRD)
58 *s++ = 'r';
59 if (c & CEPH_CAP_GWR)
60 *s++ = 'w';
61 if (c & CEPH_CAP_GBUFFER)
62 *s++ = 'b';
63 if (c & CEPH_CAP_GLAZYIO)
64 *s++ = 'l';
65 return s;
66}
67
68const char *ceph_cap_string(int caps)
69{
70 int i;
71 char *s;
72 int c;
73
74 spin_lock(&cap_str_lock);
75 i = last_cap_str++;
76 if (last_cap_str == MAX_CAP_STR)
77 last_cap_str = 0;
78 spin_unlock(&cap_str_lock);
79
80 s = cap_str[i];
81
82 if (caps & CEPH_CAP_PIN)
83 *s++ = 'p';
84
85 c = (caps >> CEPH_CAP_SAUTH) & 3;
86 if (c) {
87 *s++ = 'A';
88 s = gcap_string(s, c);
89 }
90
91 c = (caps >> CEPH_CAP_SLINK) & 3;
92 if (c) {
93 *s++ = 'L';
94 s = gcap_string(s, c);
95 }
96
97 c = (caps >> CEPH_CAP_SXATTR) & 3;
98 if (c) {
99 *s++ = 'X';
100 s = gcap_string(s, c);
101 }
102
103 c = caps >> CEPH_CAP_SFILE;
104 if (c) {
105 *s++ = 'F';
106 s = gcap_string(s, c);
107 }
108
109 if (s == cap_str[i])
110 *s++ = '-';
111 *s = 0;
112 return cap_str[i];
113}
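
ceph_cap_string can be called from any context without the caller freeing anything because it hands out slots from a small static ring under a lock; a returned string stays valid only until MAX_CAP_STR subsequent calls recycle its slot. The same trick in portable form, with a pthread mutex standing in for the spinlock (build with -lpthread):

#include <pthread.h>
#include <stdio.h>

#define NSTR 4

static char ring[NSTR][32];
static pthread_mutex_t ring_lock = PTHREAD_MUTEX_INITIALIZER;
static int next_str;

/* format into a rotating static slot; valid until NSTR later calls */
static const char *flag_string(int flags)
{
	int i;

	pthread_mutex_lock(&ring_lock);
	i = next_str++;
	if (next_str == NSTR)
		next_str = 0;
	pthread_mutex_unlock(&ring_lock);

	snprintf(ring[i], sizeof(ring[i]), "0x%x", flags);
	return ring[i];
}

int main(void)
{
	/* two live strings at once, no allocation, no free */
	printf("%s %s\n", flag_string(5), flag_string(9));
	return 0;
}
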
114
115/*
116 * Cap reservations
117 *
118 * Maintain a global pool of preallocated struct ceph_cap objects, referenced
119 * by struct ceph_cap_reservation contexts. This ensures that we preallocate
120 * memory needed to successfully process an MDS response. (If an MDS
121 * sends us cap information and we fail to process it, we will have
122 * problems due to the client and MDS being out of sync.)
123 *
124 * Reservations are 'owned' by a ceph_cap_reservation context.
125 */
126static spinlock_t caps_list_lock;
127static struct list_head caps_list; /* unused (reserved or unreserved) */
128static int caps_total_count; /* total caps allocated */
129static int caps_use_count; /* in use */
130static int caps_reserve_count; /* unused, reserved */
131static int caps_avail_count; /* unused, unreserved */
132static int caps_min_count; /* keep at least this many (unreserved) */
133
134void __init ceph_caps_init(void)
135{
136 INIT_LIST_HEAD(&caps_list);
137 spin_lock_init(&caps_list_lock);
138}
139
140void ceph_caps_finalize(void)
141{
142 struct ceph_cap *cap;
143
144 spin_lock(&caps_list_lock);
145 while (!list_empty(&caps_list)) {
146 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
147 list_del(&cap->caps_item);
148 kmem_cache_free(ceph_cap_cachep, cap);
149 }
150 caps_total_count = 0;
151 caps_avail_count = 0;
152 caps_use_count = 0;
153 caps_reserve_count = 0;
154 caps_min_count = 0;
155 spin_unlock(&caps_list_lock);
156}
157
158void ceph_adjust_min_caps(int delta)
159{
160 spin_lock(&caps_list_lock);
161 caps_min_count += delta;
162 BUG_ON(caps_min_count < 0);
163 spin_unlock(&caps_list_lock);
164}
165
166int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
167{
168 int i;
169 struct ceph_cap *cap;
170 int have;
171 int alloc = 0;
172 LIST_HEAD(newcaps);
173 int ret = 0;
174
175 dout("reserve caps ctx=%p need=%d\n", ctx, need);
176
177 /* first reserve any caps that are already allocated */
178 spin_lock(&caps_list_lock);
179 if (caps_avail_count >= need)
180 have = need;
181 else
182 have = caps_avail_count;
183 caps_avail_count -= have;
184 caps_reserve_count += have;
185 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
186 caps_avail_count);
187 spin_unlock(&caps_list_lock);
188
189 for (i = have; i < need; i++) {
190 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
191 if (!cap) {
192 ret = -ENOMEM;
193 goto out_alloc_count;
194 }
195 list_add(&cap->caps_item, &newcaps);
196 alloc++;
197 }
198 BUG_ON(have + alloc != need);
199
200 spin_lock(&caps_list_lock);
201 caps_total_count += alloc;
202 caps_reserve_count += alloc;
203 list_splice(&newcaps, &caps_list);
204
205 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
206 caps_avail_count);
207 spin_unlock(&caps_list_lock);
208
209 ctx->count = need;
210 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
211 ctx, caps_total_count, caps_use_count, caps_reserve_count,
212 caps_avail_count);
213 return 0;
214
215out_alloc_count:
216 /* we didn't manage to reserve as much as we needed */
217 pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
218 ctx, need, have);
219 return ret;
220}
221
222int ceph_unreserve_caps(struct ceph_cap_reservation *ctx)
223{
224 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
225 if (ctx->count) {
226 spin_lock(&caps_list_lock);
227 BUG_ON(caps_reserve_count < ctx->count);
228 caps_reserve_count -= ctx->count;
229 caps_avail_count += ctx->count;
230 ctx->count = 0;
231 dout("unreserve caps %d = %d used + %d resv + %d avail\n",
232 caps_total_count, caps_use_count, caps_reserve_count,
233 caps_avail_count);
234 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
235 caps_avail_count);
236 spin_unlock(&caps_list_lock);
237 }
238 return 0;
239}
240
241static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
242{
243 struct ceph_cap *cap = NULL;
244
245 /* temporary, until we do something about cap import/export */
246 if (!ctx)
247 return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
248
249 spin_lock(&caps_list_lock);
250 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
251 ctx, ctx->count, caps_total_count, caps_use_count,
252 caps_reserve_count, caps_avail_count);
253 BUG_ON(!ctx->count);
254 BUG_ON(ctx->count > caps_reserve_count);
255 BUG_ON(list_empty(&caps_list));
256
257 ctx->count--;
258 caps_reserve_count--;
259 caps_use_count++;
260
261 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
262 list_del(&cap->caps_item);
263
264 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
265 caps_avail_count);
266 spin_unlock(&caps_list_lock);
267 return cap;
268}
269
270void ceph_put_cap(struct ceph_cap *cap)
271{
272 spin_lock(&caps_list_lock);
273 dout("put_cap %p %d = %d used + %d resv + %d avail\n",
274 cap, caps_total_count, caps_use_count,
275 caps_reserve_count, caps_avail_count);
276 caps_use_count--;
277 /*
278 * Keep some preallocated caps around (caps_min_count), to
279 * avoid lots of free/alloc churn.
280 */
281 if (caps_avail_count >= caps_reserve_count + caps_min_count) {
282 caps_total_count--;
283 kmem_cache_free(ceph_cap_cachep, cap);
284 } else {
285 caps_avail_count++;
286 list_add(&cap->caps_item, &caps_list);
287 }
288
289 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
290 caps_avail_count);
291 spin_unlock(&caps_list_lock);
292}
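
These routines maintain the invariant total = used + reserved + avail across every transition: reserve moves caps from avail to reserved (allocating on shortfall), get_cap moves reserved to used, and put returns a cap to avail unless the pool already holds enough spares. A counter-only sketch of that lifecycle (no locking, invented names):

#include <assert.h>
#include <stdio.h>

static int total, used, reserved, avail;

static void check(void) { assert(total == used + reserved + avail); }

static void reserve(int need)	/* avail -> reserved, grow on shortfall */
{
	int have = avail < need ? avail : need;

	avail -= have;
	reserved += have;
	total += need - have;	/* newly allocated objects */
	reserved += need - have;
	check();
}

static void get_one(void)	/* reserved -> used */
{
	reserved--;
	used++;
	check();
}

static void put_one(void)	/* used -> avail */
{
	used--;
	avail++;
	check();
}

int main(void)
{
	reserve(3);
	get_one();
	put_one();
	printf("total=%d used=%d reserved=%d avail=%d\n",
	       total, used, reserved, avail);
	return 0;
}
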
293
294void ceph_reservation_status(struct ceph_client *client,
295 int *total, int *avail, int *used, int *reserved,
296 int *min)
297{
298 if (total)
299 *total = caps_total_count;
300 if (avail)
301 *avail = caps_avail_count;
302 if (used)
303 *used = caps_use_count;
304 if (reserved)
305 *reserved = caps_reserve_count;
306 if (min)
307 *min = caps_min_count;
308}
309
310/*
311 * Find ceph_cap for given mds, if any.
312 *
313 * Called with i_lock held.
314 */
315static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
316{
317 struct ceph_cap *cap;
318 struct rb_node *n = ci->i_caps.rb_node;
319
320 while (n) {
321 cap = rb_entry(n, struct ceph_cap, ci_node);
322 if (mds < cap->mds)
323 n = n->rb_left;
324 else if (mds > cap->mds)
325 n = n->rb_right;
326 else
327 return cap;
328 }
329 return NULL;
330}
331
332/*
333 * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else
334 * -1.
335 */
336static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq)
337{
338 struct ceph_cap *cap;
339 int mds = -1;
340 struct rb_node *p;
341
342 /* prefer mds with WR|WRBUFFER|EXCL caps */
343 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
344 cap = rb_entry(p, struct ceph_cap, ci_node);
345 mds = cap->mds;
346 if (mseq)
347 *mseq = cap->mseq;
348 if (cap->issued & (CEPH_CAP_FILE_WR |
349 CEPH_CAP_FILE_BUFFER |
350 CEPH_CAP_FILE_EXCL))
351 break;
352 }
353 return mds;
354}
355
356int ceph_get_cap_mds(struct inode *inode)
357{
358 int mds;
359 spin_lock(&inode->i_lock);
360 mds = __ceph_get_cap_mds(ceph_inode(inode), NULL);
361 spin_unlock(&inode->i_lock);
362 return mds;
363}
364
365/*
366 * Called under i_lock.
367 */
368static void __insert_cap_node(struct ceph_inode_info *ci,
369 struct ceph_cap *new)
370{
371 struct rb_node **p = &ci->i_caps.rb_node;
372 struct rb_node *parent = NULL;
373 struct ceph_cap *cap = NULL;
374
375 while (*p) {
376 parent = *p;
377 cap = rb_entry(parent, struct ceph_cap, ci_node);
378 if (new->mds < cap->mds)
379 p = &(*p)->rb_left;
380 else if (new->mds > cap->mds)
381 p = &(*p)->rb_right;
382 else
383 BUG();
384 }
385
386 rb_link_node(&new->ci_node, parent, p);
387 rb_insert_color(&new->ci_node, &ci->i_caps);
388}
389
390/*
391 * (re)set cap hold timeouts, which control the delayed release
392 * of unused caps back to the MDS. Should be called on cap use.
393 */
394static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
395 struct ceph_inode_info *ci)
396{
397 struct ceph_mount_args *ma = mdsc->client->mount_args;
398
399 ci->i_hold_caps_min = round_jiffies(jiffies +
400 ma->caps_wanted_delay_min * HZ);
401 ci->i_hold_caps_max = round_jiffies(jiffies +
402 ma->caps_wanted_delay_max * HZ);
403 dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
404 ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
405}
406
407/*
408 * (Re)queue cap at the end of the delayed cap release list.
409 *
410 * If I_FLUSH is set, leave the inode at the front of the list.
411 *
412 * Caller holds i_lock
413 * -> we take mdsc->cap_delay_lock
414 */
415static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
416 struct ceph_inode_info *ci)
417{
418 __cap_set_timeouts(mdsc, ci);
419 dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
420 ci->i_ceph_flags, ci->i_hold_caps_max);
421 if (!mdsc->stopping) {
422 spin_lock(&mdsc->cap_delay_lock);
423 if (!list_empty(&ci->i_cap_delay_list)) {
424 if (ci->i_ceph_flags & CEPH_I_FLUSH)
425 goto no_change;
426 list_del_init(&ci->i_cap_delay_list);
427 }
428 list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
429no_change:
430 spin_unlock(&mdsc->cap_delay_lock);
431 }
432}
433
434/*
435 * Queue an inode for immediate writeback. Mark inode with I_FLUSH,
436 * indicating we should send a cap message to flush dirty metadata
437 * asap, and move to the front of the delayed cap list.
438 */
439static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
440 struct ceph_inode_info *ci)
441{
442 dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
443 spin_lock(&mdsc->cap_delay_lock);
444 ci->i_ceph_flags |= CEPH_I_FLUSH;
445 if (!list_empty(&ci->i_cap_delay_list))
446 list_del_init(&ci->i_cap_delay_list);
447 list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
448 spin_unlock(&mdsc->cap_delay_lock);
449}
450
451/*
452 * Cancel delayed work on cap.
453 *
454 * Caller must hold i_lock.
455 */
456static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
457 struct ceph_inode_info *ci)
458{
459 dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
460 if (list_empty(&ci->i_cap_delay_list))
461 return;
462 spin_lock(&mdsc->cap_delay_lock);
463 list_del_init(&ci->i_cap_delay_list);
464 spin_unlock(&mdsc->cap_delay_lock);
465}
466
467/*
468 * Common issue checks for add_cap, handle_cap_grant.
469 */
470static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
471 unsigned issued)
472{
473 unsigned had = __ceph_caps_issued(ci, NULL);
474
475 /*
476 * Each time we receive FILE_CACHE anew, we increment
477 * i_rdcache_gen.
478 */
479 if ((issued & CEPH_CAP_FILE_CACHE) &&
480 (had & CEPH_CAP_FILE_CACHE) == 0)
481 ci->i_rdcache_gen++;
482
483 /*
484 * if we are newly issued FILE_SHARED, clear I_COMPLETE; we
485 * don't know what happened to this directory while we didn't
486 * have the cap.
487 */
488 if ((issued & CEPH_CAP_FILE_SHARED) &&
489 (had & CEPH_CAP_FILE_SHARED) == 0) {
490 ci->i_shared_gen++;
491 if (S_ISDIR(ci->vfs_inode.i_mode)) {
492 dout(" marking %p NOT complete\n", &ci->vfs_inode);
493 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
494 }
495 }
496}
497
498/*
499 * Add a capability under the given MDS session.
500 *
501 * Caller should hold session snap_rwsem (read) and s_mutex.
502 *
503 * @fmode is the open file mode, if we are opening a file, otherwise
504 * it is < 0. (This is so we can atomically add the cap and add an
505 * open file reference to it.)
506 */
507int ceph_add_cap(struct inode *inode,
508 struct ceph_mds_session *session, u64 cap_id,
509 int fmode, unsigned issued, unsigned wanted,
510 unsigned seq, unsigned mseq, u64 realmino, int flags,
511 struct ceph_cap_reservation *caps_reservation)
512{
513 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
514 struct ceph_inode_info *ci = ceph_inode(inode);
515 struct ceph_cap *new_cap = NULL;
516 struct ceph_cap *cap;
517 int mds = session->s_mds;
518 int actual_wanted;
519
520 dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
521 session->s_mds, cap_id, ceph_cap_string(issued), seq);
522
523 /*
524 * If we are opening the file, include file mode wanted bits
525 * in wanted.
526 */
527 if (fmode >= 0)
528 wanted |= ceph_caps_for_mode(fmode);
529
530retry:
531 spin_lock(&inode->i_lock);
532 cap = __get_cap_for_mds(ci, mds);
533 if (!cap) {
534 if (new_cap) {
535 cap = new_cap;
536 new_cap = NULL;
537 } else {
538 spin_unlock(&inode->i_lock);
539 new_cap = get_cap(caps_reservation);
540 if (new_cap == NULL)
541 return -ENOMEM;
542 goto retry;
543 }
544
545 cap->issued = 0;
546 cap->implemented = 0;
547 cap->mds = mds;
548 cap->mds_wanted = 0;
549
550 cap->ci = ci;
551 __insert_cap_node(ci, cap);
552
553 /* clear out old exporting info? (i.e. on cap import) */
554 if (ci->i_cap_exporting_mds == mds) {
555 ci->i_cap_exporting_issued = 0;
556 ci->i_cap_exporting_mseq = 0;
557 ci->i_cap_exporting_mds = -1;
558 }
559
560 /* add to session cap list */
561 cap->session = session;
562 spin_lock(&session->s_cap_lock);
563 list_add_tail(&cap->session_caps, &session->s_caps);
564 session->s_nr_caps++;
565 spin_unlock(&session->s_cap_lock);
566 }
567
568 if (!ci->i_snap_realm) {
569 /*
570 * add this inode to the appropriate snap realm
571 */
572 struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
573 realmino);
574 if (realm) {
575 ceph_get_snap_realm(mdsc, realm);
576 spin_lock(&realm->inodes_with_caps_lock);
577 ci->i_snap_realm = realm;
578 list_add(&ci->i_snap_realm_item,
579 &realm->inodes_with_caps);
580 spin_unlock(&realm->inodes_with_caps_lock);
581 } else {
582 pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
583 realmino);
584 }
585 }
586
587 __check_cap_issue(ci, cap, issued);
588
589 /*
590 * If we are issued caps we don't want, or the mds' wanted
591 * value appears to be off, queue a check so we'll release
592 * later and/or update the mds wanted value.
593 */
594 actual_wanted = __ceph_caps_wanted(ci);
595 if ((wanted & ~actual_wanted) ||
596 (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
597 dout(" issued %s, mds wanted %s, actual %s, queueing\n",
598 ceph_cap_string(issued), ceph_cap_string(wanted),
599 ceph_cap_string(actual_wanted));
600 __cap_delay_requeue(mdsc, ci);
601 }
602
603 if (flags & CEPH_CAP_FLAG_AUTH)
604 ci->i_auth_cap = cap;
605 else if (ci->i_auth_cap == cap)
606 ci->i_auth_cap = NULL;
607
608 dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
609 inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
610 ceph_cap_string(issued|cap->issued), seq, mds);
611 cap->cap_id = cap_id;
612 cap->issued = issued;
613 cap->implemented |= issued;
614 cap->mds_wanted |= wanted;
615 cap->seq = seq;
616 cap->issue_seq = seq;
617 cap->mseq = mseq;
618 cap->cap_gen = session->s_cap_gen;
619
620 if (fmode >= 0)
621 __ceph_get_fmode(ci, fmode);
622 spin_unlock(&inode->i_lock);
623 wake_up(&ci->i_cap_wq);
624 return 0;
625}
626
627/*
628 * Return true if cap has not timed out and belongs to the current
629 * generation of the MDS session (i.e. has not gone 'stale' due to
630 * us losing touch with the mds).
631 */
632static int __cap_is_valid(struct ceph_cap *cap)
633{
634 unsigned long ttl;
635 u32 gen;
636
637 spin_lock(&cap->session->s_cap_lock);
638 gen = cap->session->s_cap_gen;
639 ttl = cap->session->s_cap_ttl;
640 spin_unlock(&cap->session->s_cap_lock);
641
642 if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
643 dout("__cap_is_valid %p cap %p issued %s "
644 "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
645 cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
646 return 0;
647 }
648
649 return 1;
650}
651
652/*
653 * Return set of valid cap bits issued to us. Note that caps time
654 * out, and may be invalidated in bulk if the client session times out
655 * and session->s_cap_gen is bumped.
656 */
657int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
658{
659 int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
660 struct ceph_cap *cap;
661 struct rb_node *p;
662
663 if (implemented)
664 *implemented = 0;
665 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
666 cap = rb_entry(p, struct ceph_cap, ci_node);
667 if (!__cap_is_valid(cap))
668 continue;
669 dout("__ceph_caps_issued %p cap %p issued %s\n",
670 &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
671 have |= cap->issued;
672 if (implemented)
673 *implemented |= cap->implemented;
674 }
675 return have;
676}
677
678/*
679 * Get cap bits issued by caps other than @ocap
680 */
681int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
682{
683 int have = ci->i_snap_caps;
684 struct ceph_cap *cap;
685 struct rb_node *p;
686
687 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
688 cap = rb_entry(p, struct ceph_cap, ci_node);
689 if (cap == ocap)
690 continue;
691 if (!__cap_is_valid(cap))
692 continue;
693 have |= cap->issued;
694 }
695 return have;
696}
697
698/*
699 * Move a cap to the end of the LRU (oldest caps at list head, newest
700 * at list tail).
701 */
702static void __touch_cap(struct ceph_cap *cap)
703{
704 struct ceph_mds_session *s = cap->session;
705
706 spin_lock(&s->s_cap_lock);
707 if (s->s_cap_iterator == NULL) {
708 dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
709 s->s_mds);
710 list_move_tail(&cap->session_caps, &s->s_caps);
711 } else {
712 dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
713 &cap->ci->vfs_inode, cap, s->s_mds);
714 }
715 spin_unlock(&s->s_cap_lock);
716}
717
718/*
719 * Check if we hold the given mask. If so, move the cap(s) to the
720 * most-recently-used end of their respective LRUs. (This is the
721 * preferred way for callers to check for caps they want.)
722 */
723int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
724{
725 struct ceph_cap *cap;
726 struct rb_node *p;
727 int have = ci->i_snap_caps;
728
729 if ((have & mask) == mask) {
730 dout("__ceph_caps_issued_mask %p snap issued %s"
731 " (mask %s)\n", &ci->vfs_inode,
732 ceph_cap_string(have),
733 ceph_cap_string(mask));
734 return 1;
735 }
736
737 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
738 cap = rb_entry(p, struct ceph_cap, ci_node);
739 if (!__cap_is_valid(cap))
740 continue;
741 if ((cap->issued & mask) == mask) {
742 dout("__ceph_caps_issued_mask %p cap %p issued %s"
743 " (mask %s)\n", &ci->vfs_inode, cap,
744 ceph_cap_string(cap->issued),
745 ceph_cap_string(mask));
746 if (touch)
747 __touch_cap(cap);
748 return 1;
749 }
750
751 /* does a combination of caps satisfy mask? */
752 have |= cap->issued;
753 if ((have & mask) == mask) {
754 dout("__ceph_caps_issued_mask %p combo issued %s"
755 " (mask %s)\n", &ci->vfs_inode,
756 ceph_cap_string(cap->issued),
757 ceph_cap_string(mask));
758 if (touch) {
759 struct rb_node *q;
760
761 /* touch this + preceding caps */
762 __touch_cap(cap);
763 for (q = rb_first(&ci->i_caps); q != p;
764 q = rb_next(q)) {
765 cap = rb_entry(q, struct ceph_cap,
766 ci_node);
767 if (!__cap_is_valid(cap))
768 continue;
769 __touch_cap(cap);
770 }
771 }
772 return 1;
773 }
774 }
775
776 return 0;
777}
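
/*
 * Editorial sketch (hypothetical caller; mask chosen for illustration):
 *
 *	if (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))
 *		... trust our cached copy of the metadata ...
 *
 * Passing touch=1 moves the satisfying cap(s) to the warm end of the
 * session cap LRU, so they are the last to be reclaimed.
 */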
778
779/*
780 * Return true if mask caps are currently being revoked by an MDS.
781 */
782int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
783{
784 struct inode *inode = &ci->vfs_inode;
785 struct ceph_cap *cap;
786 struct rb_node *p;
787 int ret = 0;
788
789 spin_lock(&inode->i_lock);
790 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
791 cap = rb_entry(p, struct ceph_cap, ci_node);
792 if (__cap_is_valid(cap) &&
793 (cap->implemented & ~cap->issued & mask)) {
794 ret = 1;
795 break;
796 }
797 }
798 spin_unlock(&inode->i_lock);
799 dout("ceph_caps_revoking %p %s = %d\n", inode,
800 ceph_cap_string(mask), ret);
801 return ret;
802}
803
804int __ceph_caps_used(struct ceph_inode_info *ci)
805{
806 int used = 0;
807 if (ci->i_pin_ref)
808 used |= CEPH_CAP_PIN;
809 if (ci->i_rd_ref)
810 used |= CEPH_CAP_FILE_RD;
811 if (ci->i_rdcache_ref || ci->i_rdcache_gen)
812 used |= CEPH_CAP_FILE_CACHE;
813 if (ci->i_wr_ref)
814 used |= CEPH_CAP_FILE_WR;
815 if (ci->i_wrbuffer_ref)
816 used |= CEPH_CAP_FILE_BUFFER;
817 return used;
818}
819
820/*
821 * Return the caps wanted by virtue of open file modes.
822 */
823int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
824{
825 int want = 0;
826 int mode;
827 for (mode = 0; mode < 4; mode++)
828 if (ci->i_nr_by_mode[mode])
829 want |= ceph_caps_for_mode(mode);
830 return want;
831}
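
/*
 * Editorial worked example (array contents hypothetical): an inode
 * opened twice for read and once for write has, roughly,
 *
 *	i_nr_by_mode = { 0, 2, 1, 0 };
 *	want = ceph_caps_for_mode(1) | ceph_caps_for_mode(2);
 *
 * i.e. the union of the cap sets for each mode with a nonzero open
 * count, regardless of how many opens share that mode.
 */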
832
833/*
834 * Return caps we have registered with the MDS(s) as 'wanted'.
835 */
836int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
837{
838 struct ceph_cap *cap;
839 struct rb_node *p;
840 int mds_wanted = 0;
841
842 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
843 cap = rb_entry(p, struct ceph_cap, ci_node);
844 if (!__cap_is_valid(cap))
845 continue;
846 mds_wanted |= cap->mds_wanted;
847 }
848 return mds_wanted;
849}
850
851/*
852 * called under i_lock
853 */
854static int __ceph_is_any_caps(struct ceph_inode_info *ci)
855{
856 return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
857}
858
859/*
860 * caller should hold i_lock.
861 * caller will not hold session s_mutex if called from destroy_inode.
862 */
863void __ceph_remove_cap(struct ceph_cap *cap)
864{
865 struct ceph_mds_session *session = cap->session;
866 struct ceph_inode_info *ci = cap->ci;
867 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
868
869 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
870
871 /* remove from inode list */
872 rb_erase(&cap->ci_node, &ci->i_caps);
873 cap->ci = NULL;
874 if (ci->i_auth_cap == cap)
875 ci->i_auth_cap = NULL;
876
877 /* remove from session list */
878 spin_lock(&session->s_cap_lock);
879 if (session->s_cap_iterator == cap) {
880 /* not yet, we are iterating over this very cap */
881 dout("__ceph_remove_cap delaying %p removal from session %p\n",
882 cap, cap->session);
883 } else {
884 list_del_init(&cap->session_caps);
885 session->s_nr_caps--;
886 cap->session = NULL;
887 }
888 spin_unlock(&session->s_cap_lock);
889
890 if (cap->session == NULL)
891 ceph_put_cap(cap);
892
893 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
894 struct ceph_snap_realm *realm = ci->i_snap_realm;
895 spin_lock(&realm->inodes_with_caps_lock);
896 list_del_init(&ci->i_snap_realm_item);
897 ci->i_snap_realm_counter++;
898 ci->i_snap_realm = NULL;
899 spin_unlock(&realm->inodes_with_caps_lock);
900 ceph_put_snap_realm(mdsc, realm);
901 }
902 if (!__ceph_is_any_real_caps(ci))
903 __cap_delay_cancel(mdsc, ci);
904}
905
906/*
907 * Build and send a cap message to the given MDS.
908 *
909 * Caller should be holding s_mutex.
910 */
911static int send_cap_msg(struct ceph_mds_session *session,
912 u64 ino, u64 cid, int op,
913 int caps, int wanted, int dirty,
914 u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
915 u64 size, u64 max_size,
916 struct timespec *mtime, struct timespec *atime,
917 u64 time_warp_seq,
918 uid_t uid, gid_t gid, mode_t mode,
919 u64 xattr_version,
920 struct ceph_buffer *xattrs_buf,
921 u64 follows)
922{
923 struct ceph_mds_caps *fc;
924 struct ceph_msg *msg;
925
926 dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
927 " seq %u/%u mseq %u follows %lld size %llu/%llu"
928 " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
929 cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
930 ceph_cap_string(dirty),
931 seq, issue_seq, mseq, follows, size, max_size,
932 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
933
934 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
935 if (IS_ERR(msg))
936 return PTR_ERR(msg);
937
938 msg->hdr.tid = cpu_to_le64(flush_tid);
939
940 fc = msg->front.iov_base;
941 memset(fc, 0, sizeof(*fc));
942
943 fc->cap_id = cpu_to_le64(cid);
944 fc->op = cpu_to_le32(op);
945 fc->seq = cpu_to_le32(seq);
946 fc->issue_seq = cpu_to_le32(issue_seq);
947 fc->migrate_seq = cpu_to_le32(mseq);
948 fc->caps = cpu_to_le32(caps);
949 fc->wanted = cpu_to_le32(wanted);
950 fc->dirty = cpu_to_le32(dirty);
951 fc->ino = cpu_to_le64(ino);
952 fc->snap_follows = cpu_to_le64(follows);
953
954 fc->size = cpu_to_le64(size);
955 fc->max_size = cpu_to_le64(max_size);
956 if (mtime)
957 ceph_encode_timespec(&fc->mtime, mtime);
958 if (atime)
959 ceph_encode_timespec(&fc->atime, atime);
960 fc->time_warp_seq = cpu_to_le32(time_warp_seq);
961
962 fc->uid = cpu_to_le32(uid);
963 fc->gid = cpu_to_le32(gid);
964 fc->mode = cpu_to_le32(mode);
965
966 fc->xattr_version = cpu_to_le64(xattr_version);
967 if (xattrs_buf) {
968 msg->middle = ceph_buffer_get(xattrs_buf);
969 fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
970 msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
971 }
972
973 ceph_con_send(&session->s_con, msg);
974 return 0;
975}
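
/*
 * Editorial note: every field of struct ceph_mds_caps is encoded
 * little-endian for the wire (hence the cpu_to_le32/le64 calls above),
 * and the xattr blob, when present, rides in the message's "middle"
 * section rather than being copied into the front payload.
 */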
976
977/*
978 * Queue cap releases when an inode is dropped from our cache. Since
979 * the inode is about to be destroyed, there is no need for i_lock.
980 */
981void ceph_queue_caps_release(struct inode *inode)
982{
983 struct ceph_inode_info *ci = ceph_inode(inode);
984 struct rb_node *p;
985
986 p = rb_first(&ci->i_caps);
987 while (p) {
988 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
989 struct ceph_mds_session *session = cap->session;
990 struct ceph_msg *msg;
991 struct ceph_mds_cap_release *head;
992 struct ceph_mds_cap_item *item;
993
994 spin_lock(&session->s_cap_lock);
995 BUG_ON(!session->s_num_cap_releases);
996 msg = list_first_entry(&session->s_cap_releases,
997 struct ceph_msg, list_head);
998
999 dout(" adding %p release to mds%d msg %p (%d left)\n",
1000 inode, session->s_mds, msg, session->s_num_cap_releases);
1001
1002 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
1003 head = msg->front.iov_base;
1004 head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
1005 item = msg->front.iov_base + msg->front.iov_len;
1006 item->ino = cpu_to_le64(ceph_ino(inode));
1007 item->cap_id = cpu_to_le64(cap->cap_id);
1008 item->migrate_seq = cpu_to_le32(cap->mseq);
1009 item->seq = cpu_to_le32(cap->issue_seq);
1010
1011 session->s_num_cap_releases--;
1012
1013 msg->front.iov_len += sizeof(*item);
1014 if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
1015 dout(" release msg %p full\n", msg);
1016 list_move_tail(&msg->list_head,
1017 &session->s_cap_releases_done);
1018 } else {
1019 dout(" release msg %p at %d/%d (%d)\n", msg,
1020 (int)le32_to_cpu(head->num),
1021 (int)CEPH_CAPS_PER_RELEASE,
1022 (int)msg->front.iov_len);
1023 }
1024 spin_unlock(&session->s_cap_lock);
1025 p = rb_next(p);
1026 __ceph_remove_cap(cap);
1027 }
1028}
1029
1030/*
1031 * Send a cap msg on the given inode. Update our caps state, then
1032 * drop i_lock and send the message.
1033 *
1034 * Make note of max_size reported/requested from mds, revoked caps
1035 * that have now been implemented.
1036 *
1037 * Make a half-hearted attempt to invalidate the page cache if we are
1038 * dropping RDCACHE. Note that this will leave behind locked pages
1039 * that we'll then need to deal with elsewhere.
1040 *
1041 * Return non-zero if we delayed the release, or if we experienced an
1042 * error such that the caller should requeue + retry later.
1043 *
1044 * called with i_lock, then drops it.
1045 * caller should hold snap_rwsem (read), s_mutex.
1046 */
1047static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1048 int op, int used, int want, int retain, int flushing,
1049 unsigned *pflush_tid)
1050 __releases(cap->ci->vfs_inode->i_lock)
1051{
1052 struct ceph_inode_info *ci = cap->ci;
1053 struct inode *inode = &ci->vfs_inode;
1054 u64 cap_id = cap->cap_id;
1055 int held, revoking, dropping, keep;
1056 u64 seq, issue_seq, mseq, time_warp_seq, follows;
1057 u64 size, max_size;
1058 struct timespec mtime, atime;
1059 int wake = 0;
1060 mode_t mode;
1061 uid_t uid;
1062 gid_t gid;
1063 struct ceph_mds_session *session;
1064 u64 xattr_version = 0;
1065 int delayed = 0;
1066 u64 flush_tid = 0;
1067 int i;
1068 int ret;
1069
1070 held = cap->issued | cap->implemented;
1071 revoking = cap->implemented & ~cap->issued;
1072 retain &= ~revoking;
1073 dropping = cap->issued & ~retain;
1074
1075 dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
1076 inode, cap, cap->session,
1077 ceph_cap_string(held), ceph_cap_string(held & retain),
1078 ceph_cap_string(revoking));
1079 BUG_ON((retain & CEPH_CAP_PIN) == 0);
1080
1081 session = cap->session;
1082
1083 /* don't release wanted unless we've waited a bit. */
1084 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1085 time_before(jiffies, ci->i_hold_caps_min)) {
1086 dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
1087 ceph_cap_string(cap->issued),
1088 ceph_cap_string(cap->issued & retain),
1089 ceph_cap_string(cap->mds_wanted),
1090 ceph_cap_string(want));
1091 want |= cap->mds_wanted;
1092 retain |= cap->issued;
1093 delayed = 1;
1094 }
1095 ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
1096
1097 cap->issued &= retain; /* drop bits we don't want */
1098 if (cap->implemented & ~cap->issued) {
1099 /*
1100 * Wake up any waiters on wanted -> needed transition.
1101 * This is due to the weird transition from buffered
1102 * to sync IO... we need to flush dirty pages _before_
1103 * allowing sync writes to avoid reordering.
1104 */
1105 wake = 1;
1106 }
1107 cap->implemented &= cap->issued | used;
1108 cap->mds_wanted = want;
1109
1110 if (flushing) {
1111 /*
1112 * Assign a tid for flush operations so we can avoid
1113 * flush1 -> dirty1 -> flush2 -> flushack1 -> mark-
1114 * clean type races.  Track the latest tid for every bit
1115 * so we can handle flush AxFw, flush Fw, and have the
1116 * first ack clean Ax.
1117 */
1118 flush_tid = ++ci->i_cap_flush_last_tid;
1119 if (pflush_tid)
1120 *pflush_tid = flush_tid;
1121 dout(" cap_flush_tid %d\n", (int)flush_tid);
1122 for (i = 0; i < CEPH_CAP_BITS; i++)
1123 if (flushing & (1 << i))
1124 ci->i_cap_flush_tid[i] = flush_tid;
1125 }
1126
1127 keep = cap->implemented;
1128 seq = cap->seq;
1129 issue_seq = cap->issue_seq;
1130 mseq = cap->mseq;
1131 size = inode->i_size;
1132 ci->i_reported_size = size;
1133 max_size = ci->i_wanted_max_size;
1134 ci->i_requested_max_size = max_size;
1135 mtime = inode->i_mtime;
1136 atime = inode->i_atime;
1137 time_warp_seq = ci->i_time_warp_seq;
1138 follows = ci->i_snap_realm->cached_context->seq;
1139 uid = inode->i_uid;
1140 gid = inode->i_gid;
1141 mode = inode->i_mode;
1142
1143 if (dropping & CEPH_CAP_XATTR_EXCL) {
1144 __ceph_build_xattrs_blob(ci);
1145 xattr_version = ci->i_xattrs.version + 1;
1146 }
1147
1148 spin_unlock(&inode->i_lock);
1149
1150 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
1151 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
1152 size, max_size, &mtime, &atime, time_warp_seq,
1153 uid, gid, mode,
1154 xattr_version,
1155 (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
1156 follows);
1157 if (ret < 0) {
1158 dout("error sending cap msg, must requeue %p\n", inode);
1159 delayed = 1;
1160 }
1161
1162 if (wake)
1163 wake_up(&ci->i_cap_wq);
1164
1165 return delayed;
1166}
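
/*
 * Editorial worked example of the flush_tid bookkeeping above: we
 * flush Fw as tid 1, Fw is redirtied, and we flush Fw again as tid 2.
 * i_cap_flush_tid[Fw's bit] is now 2, so when the ack for tid 1
 * arrives, handle_cap_flush_ack() (below) sees the tid mismatch and
 * leaves Fw marked flushing; only the tid 2 ack cleans it.
 */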
1167
1168/*
1169 * When a snapshot is taken, clients accumulate dirty metadata on
1170 * inodes with capabilities in ceph_cap_snaps to describe the file
1171 * state at the time the snapshot was taken. This must be flushed
1172 * asynchronously back to the MDS once sync writes complete and dirty
1173 * data is written out.
1174 *
1175 * Called under i_lock. Takes s_mutex as needed.
1176 */
1177void __ceph_flush_snaps(struct ceph_inode_info *ci,
1178 struct ceph_mds_session **psession)
1179{
1180 struct inode *inode = &ci->vfs_inode;
1181 int mds;
1182 struct ceph_cap_snap *capsnap;
1183 u32 mseq;
1184 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
1185 struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
1186 session->s_mutex */
1187 u64 next_follows = 0; /* keep track of how far we've gotten through the
1188 i_cap_snaps list, and skip these entries next time
1189 around to avoid an infinite loop */
1190
1191 if (psession)
1192 session = *psession;
1193
1194 dout("__flush_snaps %p\n", inode);
1195retry:
1196 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
1197 /* avoid an infinite loop after retry */
1198 if (capsnap->follows < next_follows)
1199 continue;
1200 /*
1201 * we need to wait for sync writes to complete and for dirty
1202 * pages to be written out.
1203 */
1204 if (capsnap->dirty_pages || capsnap->writing)
1205 continue;
1206
1207 /* pick mds, take s_mutex */
1208 mds = __ceph_get_cap_mds(ci, &mseq);
1209 if (session && session->s_mds != mds) {
1210 dout("oops, wrong session %p mutex\n", session);
1211 mutex_unlock(&session->s_mutex);
1212 ceph_put_mds_session(session);
1213 session = NULL;
1214 }
1215 if (!session) {
1216 spin_unlock(&inode->i_lock);
1217 mutex_lock(&mdsc->mutex);
1218 session = __ceph_lookup_mds_session(mdsc, mds);
1219 mutex_unlock(&mdsc->mutex);
1220 if (session) {
1221 dout("inverting session/ino locks on %p\n",
1222 session);
1223 mutex_lock(&session->s_mutex);
1224 }
1225 /*
1226 * if session == NULL, we raced against a cap
1227 * deletion. retry, and we'll get a better
1228 * @mds value next time.
1229 */
1230 spin_lock(&inode->i_lock);
1231 goto retry;
1232 }
1233
1234 capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
1235 atomic_inc(&capsnap->nref);
1236 if (!list_empty(&capsnap->flushing_item))
1237 list_del_init(&capsnap->flushing_item);
1238 list_add_tail(&capsnap->flushing_item,
1239 &session->s_cap_snaps_flushing);
1240 spin_unlock(&inode->i_lock);
1241
1242 dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
1243 inode, capsnap, next_follows, capsnap->size);
1244 send_cap_msg(session, ceph_vino(inode).ino, 0,
1245 CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
1246 capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
1247 capsnap->size, 0,
1248 &capsnap->mtime, &capsnap->atime,
1249 capsnap->time_warp_seq,
1250 capsnap->uid, capsnap->gid, capsnap->mode,
1251 0, NULL,
1252 capsnap->follows);
1253
1254 next_follows = capsnap->follows + 1;
1255 ceph_put_cap_snap(capsnap);
1256
1257 spin_lock(&inode->i_lock);
1258 goto retry;
1259 }
1260
1261 /* we flushed them all; remove this inode from the queue */
1262 spin_lock(&mdsc->snap_flush_lock);
1263 list_del_init(&ci->i_snap_flush_item);
1264 spin_unlock(&mdsc->snap_flush_lock);
1265
1266 if (psession)
1267 *psession = session;
1268 else if (session) {
1269 mutex_unlock(&session->s_mutex);
1270 ceph_put_mds_session(session);
1271 }
1272}
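
/*
 * Editorial note on the locking dance above: s_mutex must be acquired
 * before i_lock, so on discovering that a (different) session is
 * needed we drop i_lock, take the session mutex, retake i_lock, and
 * restart the list walk -- hence the "inverting session/ino locks"
 * dout and the next_follows cursor that keeps the retries finite.
 */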
1273
1274static void ceph_flush_snaps(struct ceph_inode_info *ci)
1275{
1276 struct inode *inode = &ci->vfs_inode;
1277
1278 spin_lock(&inode->i_lock);
1279 __ceph_flush_snaps(ci, NULL);
1280 spin_unlock(&inode->i_lock);
1281}
1282
1283/*
1284 * Mark caps dirty. If inode is newly dirty, add to the global dirty
1285 * list.
1286 */
1287void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1288{
1289 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
1290 struct inode *inode = &ci->vfs_inode;
1291 int was = ci->i_dirty_caps;
1292 int dirty = 0;
1293
1294 dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
1295 ceph_cap_string(mask), ceph_cap_string(was),
1296 ceph_cap_string(was | mask));
1297 ci->i_dirty_caps |= mask;
1298 if (was == 0) {
1299 dout(" inode %p now dirty\n", &ci->vfs_inode);
1300 BUG_ON(!list_empty(&ci->i_dirty_item));
1301 spin_lock(&mdsc->cap_dirty_lock);
1302 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
1303 spin_unlock(&mdsc->cap_dirty_lock);
1304 if (ci->i_flushing_caps == 0) {
1305 igrab(inode);
1306 dirty |= I_DIRTY_SYNC;
1307 }
1308 }
1309 BUG_ON(list_empty(&ci->i_dirty_item));
1310 if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
1311 (mask & CEPH_CAP_FILE_BUFFER))
1312 dirty |= I_DIRTY_DATASYNC;
1313 if (dirty)
1314 __mark_inode_dirty(inode, dirty);
1315 __cap_delay_requeue(mdsc, ci);
1316}
1317
1318/*
1319 * Add dirty inode to the flushing list.  Assign a seq number so we
1320 * can wait for caps to flush without starving.
1321 *
1322 * Called under i_lock.
1323 */
1324static int __mark_caps_flushing(struct inode *inode,
1325 struct ceph_mds_session *session)
1326{
1327 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1328 struct ceph_inode_info *ci = ceph_inode(inode);
1329 int flushing;
1330
1331 BUG_ON(ci->i_dirty_caps == 0);
1332 BUG_ON(list_empty(&ci->i_dirty_item));
1333
1334 flushing = ci->i_dirty_caps;
1335 dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
1336 ceph_cap_string(flushing),
1337 ceph_cap_string(ci->i_flushing_caps),
1338 ceph_cap_string(ci->i_flushing_caps | flushing));
1339 ci->i_flushing_caps |= flushing;
1340 ci->i_dirty_caps = 0;
1341 dout(" inode %p now !dirty\n", inode);
1342
1343 spin_lock(&mdsc->cap_dirty_lock);
1344 list_del_init(&ci->i_dirty_item);
1345
1346 ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
1347 if (list_empty(&ci->i_flushing_item)) {
1348 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1349 mdsc->num_cap_flushing++;
1350 dout(" inode %p now flushing seq %lld\n", inode,
1351 ci->i_cap_flush_seq);
1352 } else {
1353 list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1354 dout(" inode %p now flushing (more) seq %lld\n", inode,
1355 ci->i_cap_flush_seq);
1356 }
1357 spin_unlock(&mdsc->cap_dirty_lock);
1358
1359 return flushing;
1360}
1361
1362/*
1363 * try to invalidate mapping pages without blocking.
1364 */
1365static int mapping_is_empty(struct address_space *mapping)
1366{
1367 struct page *page = find_get_page(mapping, 0);
1368
1369 if (!page)
1370 return 1;
1371
1372 put_page(page);
1373 return 0;
1374}
1375
1376static int try_nonblocking_invalidate(struct inode *inode)
1377{
1378 struct ceph_inode_info *ci = ceph_inode(inode);
1379 u32 invalidating_gen = ci->i_rdcache_gen;
1380
1381 spin_unlock(&inode->i_lock);
1382 invalidate_mapping_pages(&inode->i_data, 0, -1);
1383 spin_lock(&inode->i_lock);
1384
1385 if (mapping_is_empty(&inode->i_data) &&
1386 invalidating_gen == ci->i_rdcache_gen) {
1387 /* success. */
1388 dout("try_nonblocking_invalidate %p success\n", inode);
1389 ci->i_rdcache_gen = 0;
1390 ci->i_rdcache_revoking = 0;
1391 return 0;
1392 }
1393 dout("try_nonblocking_invalidate %p failed\n", inode);
1394 return -1;
1395}
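
/*
 * Editorial note: the i_rdcache_gen snapshot is the race guard here.
 * If new cached data can appear while i_lock is dropped for the
 * invalidate (elsewhere the generation is bumped when the FILE_CACHE
 * cap is re-issued), the compare fails and we refuse to report
 * success even when the mapping happens to look empty.
 */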
1396
1397/*
1398 * Swiss army knife function to examine currently used and wanted
1399 * versus held caps.  Release, flush, or ack revoked caps to the mds as
1400 * appropriate.
1401 *
1402 * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
1403 * cap release further.
1404 * CHECK_CAPS_AUTHONLY - we should only check the auth cap
1405 * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
1406 * further delay.
1407 */
1408void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1409 struct ceph_mds_session *session)
1410{
1411 struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
1412 struct ceph_mds_client *mdsc = &client->mdsc;
1413 struct inode *inode = &ci->vfs_inode;
1414 struct ceph_cap *cap;
1415 int file_wanted, used;
1416 int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
1417 int drop_session_lock = session ? 0 : 1;
1418 int issued, implemented, want, retain, revoking, flushing = 0;
1419 int mds = -1; /* keep track of how far we've gone through i_caps list
1420 to avoid an infinite loop on retry */
1421 struct rb_node *p;
1422 int tried_invalidate = 0;
1423 int delayed = 0, sent = 0, force_requeue = 0, num;
1424 int queue_invalidate = 0;
1425 int is_delayed = flags & CHECK_CAPS_NODELAY;
1426
1427 /* if we are unmounting, flush any unused caps immediately. */
1428 if (mdsc->stopping)
1429 is_delayed = 1;
1430
1431 spin_lock(&inode->i_lock);
1432
1433 if (ci->i_ceph_flags & CEPH_I_FLUSH)
1434 flags |= CHECK_CAPS_FLUSH;
1435
1436 /* flush snaps first time around only */
1437 if (!list_empty(&ci->i_cap_snaps))
1438 __ceph_flush_snaps(ci, &session);
1439 goto retry_locked;
1440retry:
1441 spin_lock(&inode->i_lock);
1442retry_locked:
1443 file_wanted = __ceph_caps_file_wanted(ci);
1444 used = __ceph_caps_used(ci);
1445 want = file_wanted | used;
1446 issued = __ceph_caps_issued(ci, &implemented);
1447 revoking = implemented & ~issued;
1448
1449 retain = want | CEPH_CAP_PIN;
1450 if (!mdsc->stopping && inode->i_nlink > 0) {
1451 if (want) {
1452 retain |= CEPH_CAP_ANY; /* be greedy */
1453 } else {
1454 retain |= CEPH_CAP_ANY_SHARED;
1455 /*
1456 * keep RD only if we didn't have the file open RW,
1457 * because then the mds would revoke it anyway to
1458 * journal max_size=0.
1459 */
1460 if (ci->i_max_size == 0)
1461 retain |= CEPH_CAP_ANY_RD;
1462 }
1463 }
1464
1465 dout("check_caps %p file_want %s used %s dirty %s flushing %s"
1466 " issued %s revoking %s retain %s %s%s%s\n", inode,
1467 ceph_cap_string(file_wanted),
1468 ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
1469 ceph_cap_string(ci->i_flushing_caps),
1470 ceph_cap_string(issued), ceph_cap_string(revoking),
1471 ceph_cap_string(retain),
1472 (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
1473 (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
1474 (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
1475
1476 /*
1477 * If we no longer need to hold onto our old caps, and we may
1478 * have cached pages, but don't want them, then try to invalidate.
1479 * If we fail, it's because pages are locked... try again later.
1480 */
1481 if ((!is_delayed || mdsc->stopping) &&
1482 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
1483 ci->i_rdcache_gen && /* may have cached pages */
1484 (file_wanted == 0 || /* no open files */
1485 (revoking & CEPH_CAP_FILE_CACHE)) && /* or revoking cache */
1486 !tried_invalidate) {
1487 dout("check_caps trying to invalidate on %p\n", inode);
1488 if (try_nonblocking_invalidate(inode) < 0) {
1489 if (revoking & CEPH_CAP_FILE_CACHE) {
1490 dout("check_caps queuing invalidate\n");
1491 queue_invalidate = 1;
1492 ci->i_rdcache_revoking = ci->i_rdcache_gen;
1493 } else {
1494 dout("check_caps failed to invalidate pages\n");
1495 /* we failed to invalidate pages. check these
1496 caps again later. */
1497 force_requeue = 1;
1498 __cap_set_timeouts(mdsc, ci);
1499 }
1500 }
1501 tried_invalidate = 1;
1502 goto retry_locked;
1503 }
1504
1505 num = 0;
1506 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
1507 cap = rb_entry(p, struct ceph_cap, ci_node);
1508 num++;
1509
1510 /* avoid looping forever */
1511 if (mds >= cap->mds ||
1512 ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
1513 continue;
1514
1515 /* NOTE: no side-effects allowed, until we take s_mutex */
1516
1517 revoking = cap->implemented & ~cap->issued;
1518 if (revoking)
1519 dout(" mds%d revoking %s\n", cap->mds,
1520 ceph_cap_string(revoking));
1521
1522 if (cap == ci->i_auth_cap &&
1523 (cap->issued & CEPH_CAP_FILE_WR)) {
1524 /* request larger max_size from MDS? */
1525 if (ci->i_wanted_max_size > ci->i_max_size &&
1526 ci->i_wanted_max_size > ci->i_requested_max_size) {
1527 dout("requesting new max_size\n");
1528 goto ack;
1529 }
1530
1531 /* approaching file_max? */
1532 if ((inode->i_size << 1) >= ci->i_max_size &&
1533 (ci->i_reported_size << 1) < ci->i_max_size) {
1534 dout("i_size approaching max_size\n");
1535 goto ack;
1536 }
1537 }
1538 /* flush anything dirty? */
1539 if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
1540 ci->i_dirty_caps) {
1541 dout("flushing dirty caps\n");
1542 goto ack;
1543 }
1544
1545 /* completed revocation? (none of the revoked caps are still in use) */
1546 if (revoking && (revoking & used) == 0) {
1547 dout("completed revocation of %s\n",
1548 ceph_cap_string(cap->implemented & ~cap->issued));
1549 goto ack;
1550 }
1551
1552 /* want more caps from mds? */
1553 if (want & ~(cap->mds_wanted | cap->issued))
1554 goto ack;
1555
1556 /* things we might delay */
1557 if ((cap->issued & ~retain) == 0 &&
1558 cap->mds_wanted == want)
1559 continue; /* nope, all good */
1560
1561 if (is_delayed)
1562 goto ack;
1563
1564 /* delay? */
1565 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1566 time_before(jiffies, ci->i_hold_caps_max)) {
1567 dout(" delaying issued %s -> %s, wanted %s -> %s\n",
1568 ceph_cap_string(cap->issued),
1569 ceph_cap_string(cap->issued & retain),
1570 ceph_cap_string(cap->mds_wanted),
1571 ceph_cap_string(want));
1572 delayed++;
1573 continue;
1574 }
1575
1576ack:
1577 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1578 dout(" skipping %p I_NOFLUSH set\n", inode);
1579 continue;
1580 }
1581
1582 if (session && session != cap->session) {
1583 dout("oops, wrong session %p mutex\n", session);
1584 mutex_unlock(&session->s_mutex);
1585 session = NULL;
1586 }
1587 if (!session) {
1588 session = cap->session;
1589 if (mutex_trylock(&session->s_mutex) == 0) {
1590 dout("inverting session/ino locks on %p\n",
1591 session);
1592 spin_unlock(&inode->i_lock);
1593 if (took_snap_rwsem) {
1594 up_read(&mdsc->snap_rwsem);
1595 took_snap_rwsem = 0;
1596 }
1597 mutex_lock(&session->s_mutex);
1598 goto retry;
1599 }
1600 }
1601 /* take snap_rwsem after session mutex */
1602 if (!took_snap_rwsem) {
1603 if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
1604 dout("inverting snap/in locks on %p\n",
1605 inode);
1606 spin_unlock(&inode->i_lock);
1607 down_read(&mdsc->snap_rwsem);
1608 took_snap_rwsem = 1;
1609 goto retry;
1610 }
1611 took_snap_rwsem = 1;
1612 }
1613
1614 if (cap == ci->i_auth_cap && ci->i_dirty_caps)
1615 flushing = __mark_caps_flushing(inode, session);
1616
1617 mds = cap->mds; /* remember mds, so we don't repeat */
1618 sent++;
1619
1620 /* __send_cap drops i_lock */
1621 delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
1622 retain, flushing, NULL);
1623 goto retry; /* retake i_lock and restart our cap scan. */
1624 }
1625
1626 /*
1627 * Reschedule delayed caps release if we delayed anything,
1628 * otherwise cancel.
1629 */
1630 if (delayed && is_delayed)
1631 force_requeue = 1; /* __send_cap delayed release; requeue */
1632 if (!delayed && !is_delayed)
1633 __cap_delay_cancel(mdsc, ci);
1634 else if (!is_delayed || force_requeue)
1635 __cap_delay_requeue(mdsc, ci);
1636
1637 spin_unlock(&inode->i_lock);
1638
1639 if (queue_invalidate)
1640 ceph_queue_invalidate(inode);
1641
1642 if (session && drop_session_lock)
1643 mutex_unlock(&session->s_mutex);
1644 if (took_snap_rwsem)
1645 up_read(&mdsc->snap_rwsem);
1646}
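
/*
 * Editorial note on the retry protocol above: whenever a lock must be
 * taken out of order (s_mutex or snap_rwsem while i_lock is held),
 * i_lock is dropped and the whole cap scan restarts at "retry"; the
 * @mds cursor records the highest mds already handled, so the restart
 * cannot loop forever.
 */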
1647
1648/*
1649 * Try to flush dirty caps back to the auth mds.
1650 */
1651static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
1652 unsigned *flush_tid)
1653{
1654 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1655 struct ceph_inode_info *ci = ceph_inode(inode);
1656 int unlock_session = session ? 0 : 1;
1657 int flushing = 0;
1658
1659retry:
1660 spin_lock(&inode->i_lock);
1661 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1662 dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
1663 goto out;
1664 }
1665 if (ci->i_dirty_caps && ci->i_auth_cap) {
1666 struct ceph_cap *cap = ci->i_auth_cap;
1667 int used = __ceph_caps_used(ci);
1668 int want = __ceph_caps_wanted(ci);
1669 int delayed;
1670
1671 if (!session) {
1672 spin_unlock(&inode->i_lock);
1673 session = cap->session;
1674 mutex_lock(&session->s_mutex);
1675 goto retry;
1676 }
1677 BUG_ON(session != cap->session);
1678 if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
1679 goto out;
1680
1681 flushing = __mark_caps_flushing(inode, session);
1682
1683 /* __send_cap drops i_lock */
1684 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
1685 cap->issued | cap->implemented, flushing,
1686 flush_tid);
1687 if (!delayed)
1688 goto out_unlocked;
1689
1690 spin_lock(&inode->i_lock);
1691 __cap_delay_requeue(mdsc, ci);
1692 }
1693out:
1694 spin_unlock(&inode->i_lock);
1695out_unlocked:
1696 if (session && unlock_session)
1697 mutex_unlock(&session->s_mutex);
1698 return flushing;
1699}
1700
1701/*
1702 * Return true if we've flushed caps through the given flush_tid.
1703 */
1704static int caps_are_flushed(struct inode *inode, unsigned tid)
1705{
1706 struct ceph_inode_info *ci = ceph_inode(inode);
1707 int dirty, i, ret = 1;
1708
1709 spin_lock(&inode->i_lock);
1710 dirty = __ceph_caps_dirty(ci);
1711 for (i = 0; i < CEPH_CAP_BITS; i++)
1712 if ((ci->i_flushing_caps & (1 << i)) &&
1713 ci->i_cap_flush_tid[i] <= tid) {
1714 /* still flushing this bit */
1715 ret = 0;
1716 break;
1717 }
1718 spin_unlock(&inode->i_lock);
1719 return ret;
1720}
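
/*
 * Editorial sketch: callers pair this predicate with the flush_tid
 * returned by try_flush_caps(), e.g.:
 *
 *	dirty = try_flush_caps(inode, NULL, &flush_tid);
 *	if (dirty)
 *		wait_event_interruptible(ci->i_cap_wq,
 *				caps_are_flushed(inode, flush_tid));
 *
 * which is the pattern ceph_fsync() and ceph_write_inode() use below.
 */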
1721
1722/*
1723 * Wait on any unsafe replies for the given inode. First wait on the
1724 * newest request, and make that the upper bound. Then, if there are
1725 * more requests, keep waiting on the oldest as long as it is still older
1726 * than the original request.
1727 */
1728static void sync_write_wait(struct inode *inode)
1729{
1730 struct ceph_inode_info *ci = ceph_inode(inode);
1731 struct list_head *head = &ci->i_unsafe_writes;
1732 struct ceph_osd_request *req;
1733 u64 last_tid;
1734
1735 spin_lock(&ci->i_unsafe_lock);
1736 if (list_empty(head))
1737 goto out;
1738
1739 /* set upper bound as _last_ entry in chain */
1740 req = list_entry(head->prev, struct ceph_osd_request,
1741 r_unsafe_item);
1742 last_tid = req->r_tid;
1743
1744 do {
1745 ceph_osdc_get_request(req);
1746 spin_unlock(&ci->i_unsafe_lock);
1747 dout("sync_write_wait on tid %llu (until %llu)\n",
1748 req->r_tid, last_tid);
1749 wait_for_completion(&req->r_safe_completion);
1750 spin_lock(&ci->i_unsafe_lock);
1751 ceph_osdc_put_request(req);
1752
1753 /*
1754 * from here on look at first entry in chain, since we
1755 * only want to wait for anything older than last_tid
1756 */
1757 if (list_empty(head))
1758 break;
1759 req = list_entry(head->next, struct ceph_osd_request,
1760 r_unsafe_item);
1761 } while (req->r_tid < last_tid);
1762out:
1763 spin_unlock(&ci->i_unsafe_lock);
1764}
1765
1766int ceph_fsync(struct file *file, struct dentry *dentry, int datasync)
1767{
1768 struct inode *inode = dentry->d_inode;
1769 struct ceph_inode_info *ci = ceph_inode(inode);
1770 unsigned flush_tid;
1771 int ret;
1772 int dirty;
1773
1774 dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
1775 sync_write_wait(inode);
1776
1777 ret = filemap_write_and_wait(inode->i_mapping);
1778 if (ret < 0)
1779 return ret;
1780
1781 dirty = try_flush_caps(inode, NULL, &flush_tid);
1782 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
1783
1784 /*
1785 * only wait on non-file metadata writeback (the mds
1786 * can recover size and mtime, so we don't need to
1787 * wait for that)
1788 */
1789 if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
1790 dout("fsync waiting for flush_tid %u\n", flush_tid);
1791 ret = wait_event_interruptible(ci->i_cap_wq,
1792 caps_are_flushed(inode, flush_tid));
1793 }
1794
1795 dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
1796 return ret;
1797}
1798
1799/*
1800 * Flush any dirty caps back to the mds. If we aren't asked to wait,
1801 * queue inode for flush but don't do so immediately, because we can
1802 * get by with fewer MDS messages if we wait for data writeback to
1803 * complete first.
1804 */
1805int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1806{
1807 struct ceph_inode_info *ci = ceph_inode(inode);
1808 unsigned flush_tid;
1809 int err = 0;
1810 int dirty;
1811 int wait = wbc->sync_mode == WB_SYNC_ALL;
1812
1813 dout("write_inode %p wait=%d\n", inode, wait);
1814 if (wait) {
1815 dirty = try_flush_caps(inode, NULL, &flush_tid);
1816 if (dirty)
1817 err = wait_event_interruptible(ci->i_cap_wq,
1818 caps_are_flushed(inode, flush_tid));
1819 } else {
1820 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1821
1822 spin_lock(&inode->i_lock);
1823 if (__ceph_caps_dirty(ci))
1824 __cap_delay_requeue_front(mdsc, ci);
1825 spin_unlock(&inode->i_lock);
1826 }
1827 return err;
1828}
1829
1830/*
1831 * After a recovering MDS goes active, we need to resend any caps
1832 * we were flushing.
1833 *
1834 * Caller holds session->s_mutex.
1835 */
1836static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
1837 struct ceph_mds_session *session)
1838{
1839 struct ceph_cap_snap *capsnap;
1840
1841 dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
1842 list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
1843 flushing_item) {
1844 struct ceph_inode_info *ci = capsnap->ci;
1845 struct inode *inode = &ci->vfs_inode;
1846 struct ceph_cap *cap;
1847
1848 spin_lock(&inode->i_lock);
1849 cap = ci->i_auth_cap;
1850 if (cap && cap->session == session) {
1851 dout("kick_flushing_capsnaps %p cap %p capsnap %p\n", inode,
1852 cap, capsnap);
1853 __ceph_flush_snaps(ci, &session);
1854 } else {
1855 pr_err("%p auth cap %p not mds%d ???\n", inode,
1856 cap, session->s_mds);
1857 spin_unlock(&inode->i_lock);
1858 }
1859 }
1860}
1861
1862void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
1863 struct ceph_mds_session *session)
1864{
1865 struct ceph_inode_info *ci;
1866
1867 kick_flushing_capsnaps(mdsc, session);
1868
1869 dout("kick_flushing_caps mds%d\n", session->s_mds);
1870 list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
1871 struct inode *inode = &ci->vfs_inode;
1872 struct ceph_cap *cap;
1873 int delayed = 0;
1874
1875 spin_lock(&inode->i_lock);
1876 cap = ci->i_auth_cap;
1877 if (cap && cap->session == session) {
1878 dout("kick_flushing_caps %p cap %p %s\n", inode,
1879 cap, ceph_cap_string(ci->i_flushing_caps));
1880 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
1881 __ceph_caps_used(ci),
1882 __ceph_caps_wanted(ci),
1883 cap->issued | cap->implemented,
1884 ci->i_flushing_caps, NULL);
1885 if (delayed) {
1886 spin_lock(&inode->i_lock);
1887 __cap_delay_requeue(mdsc, ci);
1888 spin_unlock(&inode->i_lock);
1889 }
1890 } else {
1891 pr_err("%p auth cap %p not mds%d ???\n", inode,
1892 cap, session->s_mds);
1893 spin_unlock(&inode->i_lock);
1894 }
1895 }
1896}
1897
1898
1899/*
1900 * Take references to capabilities we hold, so that we don't release
1901 * them to the MDS prematurely.
1902 *
1903 * Protected by i_lock.
1904 */
1905static void __take_cap_refs(struct ceph_inode_info *ci, int got)
1906{
1907 if (got & CEPH_CAP_PIN)
1908 ci->i_pin_ref++;
1909 if (got & CEPH_CAP_FILE_RD)
1910 ci->i_rd_ref++;
1911 if (got & CEPH_CAP_FILE_CACHE)
1912 ci->i_rdcache_ref++;
1913 if (got & CEPH_CAP_FILE_WR)
1914 ci->i_wr_ref++;
1915 if (got & CEPH_CAP_FILE_BUFFER) {
1916 if (ci->i_wrbuffer_ref == 0)
1917 igrab(&ci->vfs_inode);
1918 ci->i_wrbuffer_ref++;
1919 dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n",
1920 &ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref);
1921 }
1922}
1923
1924/*
1925 * Try to grab cap references. Specify those refs we @want, and the
1926 * minimal set we @need. Also include the larger offset we are writing
1927 * to (when applicable), and check against max_size here as well.
1928 * Note that caller is responsible for ensuring max_size increases are
1929 * requested from the MDS.
1930 */
1931static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
1932 int *got, loff_t endoff, int *check_max, int *err)
1933{
1934 struct inode *inode = &ci->vfs_inode;
1935 int ret = 0;
1936 int have, implemented;
1937 int file_wanted;
1938
1939 dout("get_cap_refs %p need %s want %s\n", inode,
1940 ceph_cap_string(need), ceph_cap_string(want));
1941 spin_lock(&inode->i_lock);
1942
1943 /* make sure file is actually open */
1944 file_wanted = __ceph_caps_file_wanted(ci);
1945 if ((file_wanted & need) == 0) {
1946 dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
1947 ceph_cap_string(need), ceph_cap_string(file_wanted));
1948 *err = -EBADF;
1949 ret = 1;
1950 goto out;
1951 }
1952
1953 if (need & CEPH_CAP_FILE_WR) {
1954 if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
1955 dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
1956 inode, endoff, ci->i_max_size);
1957 if (endoff > ci->i_wanted_max_size) {
1958 *check_max = 1;
1959 ret = 1;
1960 }
1961 goto out;
1962 }
1963 /*
1964 * If a sync write is in progress, we must wait, so that we
1965 * can get a final snapshot value for size+mtime.
1966 */
1967 if (__ceph_have_pending_cap_snap(ci)) {
1968 dout("get_cap_refs %p cap_snap_pending\n", inode);
1969 goto out;
1970 }
1971 }
1972 have = __ceph_caps_issued(ci, &implemented);
1973
1974 /*
1975 * disallow writes while a truncate is pending
1976 */
1977 if (ci->i_truncate_pending)
1978 have &= ~CEPH_CAP_FILE_WR;
1979
1980 if ((have & need) == need) {
1981 /*
1982 * Look at (implemented & ~have & not) so that we keep waiting
1983 * on transition from wanted -> needed caps. This is needed
1984 * for WRBUFFER|WR -> WR, to prevent a new WR sync write from
1985 * starting before a prior buffered writeback completes.
1986 */
1987 int not = want & ~(have & need);
1988 int revoking = implemented & ~have;
1989 dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
1990 inode, ceph_cap_string(have), ceph_cap_string(not),
1991 ceph_cap_string(revoking));
1992 if ((revoking & not) == 0) {
1993 *got = need | (have & want);
1994 __take_cap_refs(ci, *got);
1995 ret = 1;
1996 }
1997 } else {
1998 dout("get_cap_refs %p have %s needed %s\n", inode,
1999 ceph_cap_string(have), ceph_cap_string(need));
2000 }
2001out:
2002 spin_unlock(&inode->i_lock);
2003 dout("get_cap_refs %p ret %d got %s\n", inode,
2004 ret, ceph_cap_string(*got));
2005 return ret;
2006}
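
/*
 * Editorial worked example of the "not"/"revoking" test above: a sync
 * write needs Fw and wants Fb while Fb is being revoked, so
 * have = Fw and implemented = Fw|Fb.  Then
 *
 *	not      = want & ~(have & need)  = Fb
 *	revoking = implemented & ~have    = Fb
 *
 * revoking & not is nonzero, so we keep waiting until the buffered
 * writeback completes and Fb is fully revoked.
 */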
2007
2008/*
2009 * Check the offset we are writing up to against our current
2010 * max_size. If necessary, tell the MDS we want to write to
2011 * a larger offset.
2012 */
2013static void check_max_size(struct inode *inode, loff_t endoff)
2014{
2015 struct ceph_inode_info *ci = ceph_inode(inode);
2016 int check = 0;
2017
2018 /* do we need to explicitly request a larger max_size? */
2019 spin_lock(&inode->i_lock);
2020 if ((endoff >= ci->i_max_size ||
2021 endoff > (inode->i_size << 1)) &&
2022 endoff > ci->i_wanted_max_size) {
2023 dout("write %p at large endoff %llu, req max_size\n",
2024 inode, endoff);
2025 ci->i_wanted_max_size = endoff;
2026 check = 1;
2027 }
2028 spin_unlock(&inode->i_lock);
2029 if (check)
2030 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2031}
2032
2033/*
2034 * Wait for caps, and take cap references. If we can't get a WR cap
2035 * due to a small max_size, make sure we check_max_size (and possibly
2036 * ask the mds) so we don't get hung up indefinitely.
2037 */
2038int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got,
2039 loff_t endoff)
2040{
2041 int check_max, ret, err;
2042
2043retry:
2044 if (endoff > 0)
2045 check_max_size(&ci->vfs_inode, endoff);
2046 check_max = 0;
2047 err = 0;
2048 ret = wait_event_interruptible(ci->i_cap_wq,
2049 try_get_cap_refs(ci, need, want,
2050 got, endoff,
2051 &check_max, &err));
2052 if (err)
2053 ret = err;
2054 if (check_max)
2055 goto retry;
2056 return ret;
2057}
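
/*
 * Editorial sketch (hypothetical caller; the real write path lives in
 * file.c): a sync write extending to @endoff would do roughly
 *
 *	ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
 *			    &got, endoff);
 *
 * so that a too-small max_size is noticed and re-requested from the
 * MDS instead of the cap wait hanging indefinitely.
 */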
2058
2059/*
2060 * Take cap refs. Caller must already know we hold at least one ref
2061 * on the caps in question; otherwise this is not known to be safe.
2062 */
2063void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
2064{
2065 spin_lock(&ci->vfs_inode.i_lock);
2066 __take_cap_refs(ci, caps);
2067 spin_unlock(&ci->vfs_inode.i_lock);
2068}
2069
2070/*
2071 * Release cap refs.
2072 *
2073 * If we released the last ref on any given cap, call ceph_check_caps
2074 * to release (or schedule a release).
2075 *
2076 * If we are releasing a WR cap (from a sync write), finalize any affected
2077 * cap_snap, and wake up any waiters.
2078 */
2079void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
2080{
2081 struct inode *inode = &ci->vfs_inode;
2082 int last = 0, put = 0, flushsnaps = 0, wake = 0;
2083 struct ceph_cap_snap *capsnap;
2084
2085 spin_lock(&inode->i_lock);
2086 if (had & CEPH_CAP_PIN)
2087 --ci->i_pin_ref;
2088 if (had & CEPH_CAP_FILE_RD)
2089 if (--ci->i_rd_ref == 0)
2090 last++;
2091 if (had & CEPH_CAP_FILE_CACHE)
2092 if (--ci->i_rdcache_ref == 0)
2093 last++;
2094 if (had & CEPH_CAP_FILE_BUFFER) {
2095 if (--ci->i_wrbuffer_ref == 0) {
2096 last++;
2097 put++;
2098 }
2099 dout("put_cap_refs %p wrbuffer %d -> %d (?)\n",
2100 inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref);
2101 }
2102 if (had & CEPH_CAP_FILE_WR)
2103 if (--ci->i_wr_ref == 0) {
2104 last++;
2105 if (!list_empty(&ci->i_cap_snaps)) {
2106 capsnap = list_first_entry(&ci->i_cap_snaps,
2107 struct ceph_cap_snap,
2108 ci_item);
2109 if (capsnap->writing) {
2110 capsnap->writing = 0;
2111 flushsnaps =
2112 __ceph_finish_cap_snap(ci,
2113 capsnap);
2114 wake = 1;
2115 }
2116 }
2117 }
2118 spin_unlock(&inode->i_lock);
2119
2120 dout("put_cap_refs %p had %s %s\n", inode, ceph_cap_string(had),
2121 last ? "last" : "");
2122
2123 if (last && !flushsnaps)
2124 ceph_check_caps(ci, 0, NULL);
2125 else if (flushsnaps)
2126 ceph_flush_snaps(ci);
2127 if (wake)
2128 wake_up(&ci->i_cap_wq);
2129 if (put)
2130 iput(inode);
2131}
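
/*
 * Editorial note: these puts balance ceph_get_caps()/ceph_get_cap_refs()
 * above; a caller that obtained @got must eventually call
 *
 *	ceph_put_cap_refs(ci, got);
 *
 * since dropping the last ref is what triggers the deferred
 * check_caps/flush work.
 */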
2132
2133/*
2134 * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
2135 * context. Adjust per-snap dirty page accounting as appropriate.
2136 * Once all dirty data for a cap_snap is flushed, flush snapped file
2137 * metadata back to the MDS. If we dropped the last ref, call
2138 * ceph_check_caps.
2139 */
2140void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2141 struct ceph_snap_context *snapc)
2142{
2143 struct inode *inode = &ci->vfs_inode;
2144 int last = 0;
2145 int last_snap = 0;
2146 int found = 0;
2147 struct ceph_cap_snap *capsnap = NULL;
2148
2149 spin_lock(&inode->i_lock);
2150 ci->i_wrbuffer_ref -= nr;
2151 last = !ci->i_wrbuffer_ref;
2152
2153 if (ci->i_head_snapc == snapc) {
2154 ci->i_wrbuffer_ref_head -= nr;
2155 if (!ci->i_wrbuffer_ref_head) {
2156 ceph_put_snap_context(ci->i_head_snapc);
2157 ci->i_head_snapc = NULL;
2158 }
2159 dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
2160 inode,
2161 ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
2162 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
2163 last ? " LAST" : "");
2164 } else {
2165 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2166 if (capsnap->context == snapc) {
2167 found = 1;
2168 capsnap->dirty_pages -= nr;
2169 last_snap = !capsnap->dirty_pages;
2170 break;
2171 }
2172 }
2173 BUG_ON(!found);
2174 dout("put_wrbuffer_cap_refs on %p cap_snap %p "
2175 " snap %lld %d/%d -> %d/%d %s%s\n",
2176 inode, capsnap, capsnap->context->seq,
2177 ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
2178 ci->i_wrbuffer_ref, capsnap->dirty_pages,
2179 last ? " (wrbuffer last)" : "",
2180 last_snap ? " (capsnap last)" : "");
2181 }
2182
2183 spin_unlock(&inode->i_lock);
2184
2185 if (last) {
2186 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2187 iput(inode);
2188 } else if (last_snap) {
2189 ceph_flush_snaps(ci);
2190 wake_up(&ci->i_cap_wq);
2191 }
2192}
2193
2194/*
2195 * Handle a cap GRANT message from the MDS. (Note that a GRANT may
2196 * actually be a revocation if it specifies a smaller cap set.)
2197 *
2198 * caller holds s_mutex.
2199 * return value:
2200 * 0 - ok
2201 * 1 - check_caps on auth cap only (writeback)
2202 * 2 - check_caps (ack revoke)
2203 */
2204static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2205 struct ceph_mds_session *session,
2206 struct ceph_cap *cap,
2207 struct ceph_buffer *xattr_buf)
2208 __releases(inode->i_lock)
2209
2210{
2211 struct ceph_inode_info *ci = ceph_inode(inode);
2212 int mds = session->s_mds;
2213 int seq = le32_to_cpu(grant->seq);
2214 int newcaps = le32_to_cpu(grant->caps);
2215 int issued, implemented, used, wanted, dirty;
2216 u64 size = le64_to_cpu(grant->size);
2217 u64 max_size = le64_to_cpu(grant->max_size);
2218 struct timespec mtime, atime, ctime;
2219 int reply = 0;
2220 int wake = 0;
2221 int writeback = 0;
2222 int revoked_rdcache = 0;
2223 int queue_invalidate = 0;
2224
2225 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
2226 inode, cap, mds, seq, ceph_cap_string(newcaps));
2227 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
2228 inode->i_size);
2229
2230 /*
2231 * If CACHE is being revoked, and we have no dirty buffers,
2232 * try to invalidate (once). (If there are dirty buffers, we
2233 * will invalidate _after_ writeback.)
2234 */
2235 if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
2236 !ci->i_wrbuffer_ref) {
2237 if (try_nonblocking_invalidate(inode) == 0) {
2238 revoked_rdcache = 1;
2239 } else {
2240 /* there were locked pages... invalidate later
2241 in a separate thread. */
2242 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
2243 queue_invalidate = 1;
2244 ci->i_rdcache_revoking = ci->i_rdcache_gen;
2245 }
2246 }
2247 }
2248
2249 /* side effects are now allowed */
2250
2251 issued = __ceph_caps_issued(ci, &implemented);
2252 issued |= implemented | __ceph_caps_dirty(ci);
2253
2254 cap->cap_gen = session->s_cap_gen;
2255
2256 __check_cap_issue(ci, cap, newcaps);
2257
2258 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
2259 inode->i_mode = le32_to_cpu(grant->mode);
2260 inode->i_uid = le32_to_cpu(grant->uid);
2261 inode->i_gid = le32_to_cpu(grant->gid);
2262 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
2263 inode->i_uid, inode->i_gid);
2264 }
2265
2266 if ((issued & CEPH_CAP_LINK_EXCL) == 0)
2267 inode->i_nlink = le32_to_cpu(grant->nlink);
2268
2269 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
2270 int len = le32_to_cpu(grant->xattr_len);
2271 u64 version = le64_to_cpu(grant->xattr_version);
2272
2273 if (version > ci->i_xattrs.version) {
2274 dout(" got new xattrs v%llu on %p len %d\n",
2275 version, inode, len);
2276 if (ci->i_xattrs.blob)
2277 ceph_buffer_put(ci->i_xattrs.blob);
2278 ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
2279 ci->i_xattrs.version = version;
2280 }
2281 }
2282
2283 /* size/ctime/mtime/atime? */
2284 ceph_fill_file_size(inode, issued,
2285 le32_to_cpu(grant->truncate_seq),
2286 le64_to_cpu(grant->truncate_size), size);
2287 ceph_decode_timespec(&mtime, &grant->mtime);
2288 ceph_decode_timespec(&atime, &grant->atime);
2289 ceph_decode_timespec(&ctime, &grant->ctime);
2290 ceph_fill_file_time(inode, issued,
2291 le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
2292 &atime);
2293
2294 /* max size increase? */
2295 if (max_size != ci->i_max_size) {
2296 dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
2297 ci->i_max_size = max_size;
2298 if (max_size >= ci->i_wanted_max_size) {
2299 ci->i_wanted_max_size = 0; /* reset */
2300 ci->i_requested_max_size = 0;
2301 }
2302 wake = 1;
2303 }
2304
2305 /* check cap bits */
2306 wanted = __ceph_caps_wanted(ci);
2307 used = __ceph_caps_used(ci);
2308 dirty = __ceph_caps_dirty(ci);
2309 dout(" my wanted = %s, used = %s, dirty %s\n",
2310 ceph_cap_string(wanted),
2311 ceph_cap_string(used),
2312 ceph_cap_string(dirty));
2313 if (wanted != le32_to_cpu(grant->wanted)) {
2314 dout("mds wanted %s -> %s\n",
2315 ceph_cap_string(le32_to_cpu(grant->wanted)),
2316 ceph_cap_string(wanted));
2317 grant->wanted = cpu_to_le32(wanted);
2318 }
2319
2320 cap->seq = seq;
2321
2322 /* file layout may have changed */
2323 ci->i_layout = grant->layout;
2324
2325 /* revocation, grant, or no-op? */
2326 if (cap->issued & ~newcaps) {
2327 dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued),
2328 ceph_cap_string(newcaps));
2329 if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
2330 writeback = 1; /* will delay ack */
2331 else if (dirty & ~newcaps)
2332 reply = 1; /* initiate writeback in check_caps */
2333 else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
2334 revoked_rdcache)
2335 reply = 2; /* send revoke ack in check_caps */
2336 cap->issued = newcaps;
2337 } else if (cap->issued == newcaps) {
2338 dout("caps unchanged: %s -> %s\n",
2339 ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
2340 } else {
2341 dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
2342 ceph_cap_string(newcaps));
2343 cap->issued = newcaps;
2344 cap->implemented |= newcaps; /* add bits only, to
2345 * avoid stepping on a
2346 * pending revocation */
2347 wake = 1;
2348 }
2349
2350 spin_unlock(&inode->i_lock);
2351 if (writeback)
2352 /*
2353 * queue inode for writeback: we can't actually call
2354 * filemap_write_and_wait, etc. from message handler
2355 * context.
2356 */
2357 ceph_queue_writeback(inode);
2358 if (queue_invalidate)
2359 ceph_queue_invalidate(inode);
2360 if (wake)
2361 wake_up(&ci->i_cap_wq);
2362 return reply;
2363}
2364
2365/*
2366 * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
2367 * MDS has been safely committed.
2368 */
2369static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2370 struct ceph_mds_caps *m,
2371 struct ceph_mds_session *session,
2372 struct ceph_cap *cap)
2373 __releases(inode->i_lock)
2374{
2375 struct ceph_inode_info *ci = ceph_inode(inode);
2376 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
2377 unsigned seq = le32_to_cpu(m->seq);
2378 int dirty = le32_to_cpu(m->dirty);
2379 int cleaned = 0;
2380 int drop = 0;
2381 int i;
2382
2383 for (i = 0; i < CEPH_CAP_BITS; i++)
2384 if ((dirty & (1 << i)) &&
2385 flush_tid == ci->i_cap_flush_tid[i])
2386 cleaned |= 1 << i;
2387
2388 dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
2389 " flushing %s -> %s\n",
2390 inode, session->s_mds, seq, ceph_cap_string(dirty),
2391 ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
2392 ceph_cap_string(ci->i_flushing_caps & ~cleaned));
2393
2394 if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
2395 goto out;
2396
2397 ci->i_flushing_caps &= ~cleaned;
2398
2399 spin_lock(&mdsc->cap_dirty_lock);
2400 if (ci->i_flushing_caps == 0) {
2401 list_del_init(&ci->i_flushing_item);
2402 if (!list_empty(&session->s_cap_flushing))
2403 dout(" mds%d still flushing cap on %p\n",
2404 session->s_mds,
2405 &list_entry(session->s_cap_flushing.next,
2406 struct ceph_inode_info,
2407 i_flushing_item)->vfs_inode);
2408 mdsc->num_cap_flushing--;
2409 wake_up(&mdsc->cap_flushing_wq);
2410 dout(" inode %p now !flushing\n", inode);
2411
2412 if (ci->i_dirty_caps == 0) {
2413 dout(" inode %p now clean\n", inode);
2414 BUG_ON(!list_empty(&ci->i_dirty_item));
2415 drop = 1;
2416 } else {
2417 BUG_ON(list_empty(&ci->i_dirty_item));
2418 }
2419 }
2420 spin_unlock(&mdsc->cap_dirty_lock);
2421 wake_up(&ci->i_cap_wq);
2422
2423out:
2424 spin_unlock(&inode->i_lock);
2425 if (drop)
2426 iput(inode);
2427}
2428
2429/*
2430 * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
2431 * throw away our cap_snap.
2432 *
2433 * Caller holds s_mutex.
2434 */
2435static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
2436 struct ceph_mds_caps *m,
2437 struct ceph_mds_session *session)
2438{
2439 struct ceph_inode_info *ci = ceph_inode(inode);
2440 u64 follows = le64_to_cpu(m->snap_follows);
2441 struct ceph_cap_snap *capsnap;
2442 int drop = 0;
2443
2444 dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
2445 inode, ci, session->s_mds, follows);
2446
2447 spin_lock(&inode->i_lock);
2448 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2449 if (capsnap->follows == follows) {
2450 if (capsnap->flush_tid != flush_tid) {
2451 dout(" cap_snap %p follows %lld tid %lld !="
2452 " %lld\n", capsnap, follows,
2453 flush_tid, capsnap->flush_tid);
2454 break;
2455 }
2456 WARN_ON(capsnap->dirty_pages || capsnap->writing);
2457 dout(" removing cap_snap %p follows %lld\n",
2458 capsnap, follows);
2459 ceph_put_snap_context(capsnap->context);
2460 list_del(&capsnap->ci_item);
2461 list_del(&capsnap->flushing_item);
2462 ceph_put_cap_snap(capsnap);
2463 drop = 1;
2464 break;
2465 } else {
2466 dout(" skipping cap_snap %p follows %lld\n",
2467 capsnap, capsnap->follows);
2468 }
2469 }
2470 spin_unlock(&inode->i_lock);
2471 if (drop)
2472 iput(inode);
2473}
2474
2475/*
2476 * Handle TRUNC from MDS, indicating file truncation.
2477 *
2478 * caller holds s_mutex.
2479 */
2480static void handle_cap_trunc(struct inode *inode,
2481 struct ceph_mds_caps *trunc,
2482 struct ceph_mds_session *session)
2483 __releases(inode->i_lock)
2484{
2485 struct ceph_inode_info *ci = ceph_inode(inode);
2486 int mds = session->s_mds;
2487 int seq = le32_to_cpu(trunc->seq);
2488 u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
2489 u64 truncate_size = le64_to_cpu(trunc->truncate_size);
2490 u64 size = le64_to_cpu(trunc->size);
2491 int implemented = 0;
2492 int dirty = __ceph_caps_dirty(ci);
2493 int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
2494 int queue_trunc = 0;
2495
2496 issued |= implemented | dirty;
2497
2498 dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
2499 inode, mds, seq, truncate_size, truncate_seq);
2500 queue_trunc = ceph_fill_file_size(inode, issued,
2501 truncate_seq, truncate_size, size);
2502 spin_unlock(&inode->i_lock);
2503
2504 if (queue_trunc)
2505 ceph_queue_vmtruncate(inode);
2506}
2507
2508/*
2509 * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a
2510 * different one.  If this is the most recent migration we've seen (as
2511 * indicated by mseq), make note of the migrating cap bits for the
2512 * duration (until we see the corresponding IMPORT).
2513 *
2514 * caller holds s_mutex
2515 */
2516static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2517 struct ceph_mds_session *session)
2518{
2519 struct ceph_inode_info *ci = ceph_inode(inode);
2520 int mds = session->s_mds;
2521 unsigned mseq = le32_to_cpu(ex->migrate_seq);
2522 struct ceph_cap *cap = NULL, *t;
2523 struct rb_node *p;
2524 int remember = 1;
2525
2526 dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
2527 inode, ci, mds, mseq);
2528
2529 spin_lock(&inode->i_lock);
2530
2531 /* make sure we haven't seen a higher mseq */
2532 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
2533 t = rb_entry(p, struct ceph_cap, ci_node);
2534 if (ceph_seq_cmp(t->mseq, mseq) > 0) {
2535 dout(" higher mseq on cap from mds%d\n",
2536 t->session->s_mds);
2537 remember = 0;
2538 }
2539 if (t->session->s_mds == mds)
2540 cap = t;
2541 }
2542
2543 if (cap) {
2544 if (remember) {
2545 /* make note */
2546 ci->i_cap_exporting_mds = mds;
2547 ci->i_cap_exporting_mseq = mseq;
2548 ci->i_cap_exporting_issued = cap->issued;
2549 }
2550 __ceph_remove_cap(cap);
2551 } else {
2552 WARN_ON(!cap);
2553 }
2554
2555 spin_unlock(&inode->i_lock);
2556}
2557
2558/*
2559 * Handle cap IMPORT. If there are temp bits from an older EXPORT,
2560 * clean them up.
2561 *
2562 * caller holds s_mutex.
2563 */
2564static void handle_cap_import(struct ceph_mds_client *mdsc,
2565 struct inode *inode, struct ceph_mds_caps *im,
2566 struct ceph_mds_session *session,
2567 void *snaptrace, int snaptrace_len)
2568{
2569 struct ceph_inode_info *ci = ceph_inode(inode);
2570 int mds = session->s_mds;
2571 unsigned issued = le32_to_cpu(im->caps);
2572 unsigned wanted = le32_to_cpu(im->wanted);
2573 unsigned seq = le32_to_cpu(im->seq);
2574 unsigned mseq = le32_to_cpu(im->migrate_seq);
2575 u64 realmino = le64_to_cpu(im->realm);
2576 u64 cap_id = le64_to_cpu(im->cap_id);
2577
2578 if (ci->i_cap_exporting_mds >= 0 &&
2579 ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
2580 dout("handle_cap_import inode %p ci %p mds%d mseq %d"
2581 " - cleared exporting from mds%d\n",
2582 inode, ci, mds, mseq,
2583 ci->i_cap_exporting_mds);
2584 ci->i_cap_exporting_issued = 0;
2585 ci->i_cap_exporting_mseq = 0;
2586 ci->i_cap_exporting_mds = -1;
2587 } else {
2588 dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
2589 inode, ci, mds, mseq);
2590 }
2591
2592 down_write(&mdsc->snap_rwsem);
2593 ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
2594 false);
2595 downgrade_write(&mdsc->snap_rwsem);
2596 ceph_add_cap(inode, session, cap_id, -1,
2597 issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
2598 NULL /* no caps context */);
2599 try_flush_caps(inode, session, NULL);
2600 up_read(&mdsc->snap_rwsem);
2601}
2602
2603/*
2604 * Handle a caps message from the MDS.
2605 *
2606 * Identify the appropriate session, inode, and call the right handler
2607 * based on the cap op.
2608 */
2609void ceph_handle_caps(struct ceph_mds_session *session,
2610 struct ceph_msg *msg)
2611{
2612 struct ceph_mds_client *mdsc = session->s_mdsc;
2613 struct super_block *sb = mdsc->client->sb;
2614 struct inode *inode;
2615 struct ceph_cap *cap;
2616 struct ceph_mds_caps *h;
2617 int mds = session->s_mds;
2618 int op;
2619 u32 seq;
2620 struct ceph_vino vino;
2621 u64 cap_id;
2622 u64 size, max_size;
2623 u64 tid;
2624 int check_caps = 0;
2625 void *snaptrace;
2626 int r;
2627
2628 dout("handle_caps from mds%d\n", mds);
2629
2630 /* decode */
2631 tid = le64_to_cpu(msg->hdr.tid);
2632 if (msg->front.iov_len < sizeof(*h))
2633 goto bad;
2634 h = msg->front.iov_base;
2635 snaptrace = h + 1;
2636 op = le32_to_cpu(h->op);
2637 vino.ino = le64_to_cpu(h->ino);
2638 vino.snap = CEPH_NOSNAP;
2639 cap_id = le64_to_cpu(h->cap_id);
2640 seq = le32_to_cpu(h->seq);
2641 size = le64_to_cpu(h->size);
2642 max_size = le64_to_cpu(h->max_size);
2643
2644 mutex_lock(&session->s_mutex);
2645 session->s_seq++;
2646 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
2647 (unsigned)seq);
2648
2649 /* lookup ino */
2650 inode = ceph_find_inode(sb, vino);
2651 dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
2652 vino.snap, inode);
2653 if (!inode) {
2654 dout(" i don't have ino %llx\n", vino.ino);
2655 goto done;
2656 }
2657
2658 /* these will work even if we don't have a cap yet */
2659 switch (op) {
2660 case CEPH_CAP_OP_FLUSHSNAP_ACK:
2661 handle_cap_flushsnap_ack(inode, tid, h, session);
2662 goto done;
2663
2664 case CEPH_CAP_OP_EXPORT:
2665 handle_cap_export(inode, h, session);
2666 goto done;
2667
2668 case CEPH_CAP_OP_IMPORT:
2669 handle_cap_import(mdsc, inode, h, session,
2670 snaptrace, le32_to_cpu(h->snap_trace_len));
2671 check_caps = 1; /* we may have sent a RELEASE to the old auth */
2672 goto done;
2673 }
2674
2675 /* the rest require a cap */
2676 spin_lock(&inode->i_lock);
2677 cap = __get_cap_for_mds(ceph_inode(inode), mds);
2678 if (!cap) {
2679 dout("no cap on %p ino %llx.%llx from mds%d, releasing\n",
2680 inode, ceph_ino(inode), ceph_snap(inode), mds);
2681 spin_unlock(&inode->i_lock);
2682 goto done;
2683 }
2684
2685 /* note that each of these drops i_lock for us */
2686 switch (op) {
2687 case CEPH_CAP_OP_REVOKE:
2688 case CEPH_CAP_OP_GRANT:
2689 r = handle_cap_grant(inode, h, session, cap, msg->middle);
2690 if (r == 1)
2691 ceph_check_caps(ceph_inode(inode),
2692 CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
2693 session);
2694 else if (r == 2)
2695 ceph_check_caps(ceph_inode(inode),
2696 CHECK_CAPS_NODELAY,
2697 session);
2698 break;
2699
2700 case CEPH_CAP_OP_FLUSH_ACK:
2701 handle_cap_flush_ack(inode, tid, h, session, cap);
2702 break;
2703
2704 case CEPH_CAP_OP_TRUNC:
2705 handle_cap_trunc(inode, h, session);
2706 break;
2707
2708 default:
2709 spin_unlock(&inode->i_lock);
2710 pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
2711 ceph_cap_op_name(op));
2712 }
2713
2714done:
2715 mutex_unlock(&session->s_mutex);
2716
2717 if (check_caps)
2718 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, NULL);
2719 if (inode)
2720 iput(inode);
2721 return;
2722
2723bad:
2724 pr_err("ceph_handle_caps: corrupt message\n");
2725 ceph_msg_dump(msg);
2726 return;
2727}
2728
2729/*
2730 * Delayed work handler to process end of delayed cap release LRU list.
2731 */
2732void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
2733{
2734 struct ceph_inode_info *ci;
2735 int flags = CHECK_CAPS_NODELAY;
2736
2737 dout("check_delayed_caps\n");
2738 while (1) {
2739 spin_lock(&mdsc->cap_delay_lock);
2740 if (list_empty(&mdsc->cap_delay_list))
2741 break;
2742 ci = list_first_entry(&mdsc->cap_delay_list,
2743 struct ceph_inode_info,
2744 i_cap_delay_list);
2745 if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
2746 time_before(jiffies, ci->i_hold_caps_max))
2747 break;
2748 list_del_init(&ci->i_cap_delay_list);
2749 spin_unlock(&mdsc->cap_delay_lock);
2750 dout("check_delayed_caps on %p\n", &ci->vfs_inode);
2751 ceph_check_caps(ci, flags, NULL);
2752 }
2753 spin_unlock(&mdsc->cap_delay_lock);
2754}
2755
2756/*
2757 * Flush all dirty caps to the mds
2758 */
2759void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
2760{
2761 struct ceph_inode_info *ci, *nci = NULL;
2762 struct inode *inode, *ninode = NULL;
2763 struct list_head *p, *n;
2764
2765 dout("flush_dirty_caps\n");
2766 spin_lock(&mdsc->cap_dirty_lock);
2767 list_for_each_safe(p, n, &mdsc->cap_dirty) {
2768 if (nci) {
2769 ci = nci;
2770 inode = ninode;
2771 ci->i_ceph_flags &= ~CEPH_I_NOFLUSH;
2772 dout("flush_dirty_caps inode %p (was next inode)\n",
2773 inode);
2774 } else {
2775 ci = list_entry(p, struct ceph_inode_info,
2776 i_dirty_item);
2777 inode = igrab(&ci->vfs_inode);
2778 BUG_ON(!inode);
2779 dout("flush_dirty_caps inode %p\n", inode);
2780 }
2781 if (n != &mdsc->cap_dirty) {
2782 nci = list_entry(n, struct ceph_inode_info,
2783 i_dirty_item);
2784 ninode = igrab(&nci->vfs_inode);
2785 BUG_ON(!ninode);
2786 nci->i_ceph_flags |= CEPH_I_NOFLUSH;
2787 dout("flush_dirty_caps next inode %p, noflush\n",
2788 ninode);
2789 } else {
2790 nci = NULL;
2791 ninode = NULL;
2792 }
2793 spin_unlock(&mdsc->cap_dirty_lock);
2794 if (inode) {
2795 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
2796 NULL);
2797 iput(inode);
2798 }
2799 spin_lock(&mdsc->cap_dirty_lock);
2800 }
2801 spin_unlock(&mdsc->cap_dirty_lock);
2802}
2803
2804/*
2805 * Drop open file reference. If we were the last open file,
2806 * we may need to release capabilities to the MDS (or schedule
2807 * their delayed release).
2808 */
2809void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
2810{
2811 struct inode *inode = &ci->vfs_inode;
2812 int last = 0;
2813
2814 spin_lock(&inode->i_lock);
2815 dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
2816 ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
2817 BUG_ON(ci->i_nr_by_mode[fmode] == 0);
2818 if (--ci->i_nr_by_mode[fmode] == 0)
2819 last++;
2820 spin_unlock(&inode->i_lock);
2821
2822 if (last && ci->i_vino.snap == CEPH_NOSNAP)
2823 ceph_check_caps(ci, 0, NULL);
2824}
2825
2826/*
2827 * Helpers for embedding cap and dentry lease releases into mds
2828 * requests.
2829 *
2830 * @force is used by dentry_release (below) to force inclusion of a
2831 * record for the directory inode, even when there aren't any caps to
2832 * drop.
2833 */
2834int ceph_encode_inode_release(void **p, struct inode *inode,
2835 int mds, int drop, int unless, int force)
2836{
2837 struct ceph_inode_info *ci = ceph_inode(inode);
2838 struct ceph_cap *cap;
2839 struct ceph_mds_request_release *rel = *p;
2840 int ret = 0;
2841
2842 dout("encode_inode_release %p mds%d drop %s unless %s\n", inode,
2843 mds, ceph_cap_string(drop), ceph_cap_string(unless));
2844
2845 spin_lock(&inode->i_lock);
2846 cap = __get_cap_for_mds(ci, mds);
2847 if (cap && __cap_is_valid(cap)) {
2848 if (force ||
2849 ((cap->issued & drop) &&
2850 (cap->issued & unless) == 0)) {
2851 if ((cap->issued & drop) &&
2852 (cap->issued & unless) == 0) {
2853 dout("encode_inode_release %p cap %p %s -> "
2854 "%s\n", inode, cap,
2855 ceph_cap_string(cap->issued),
2856 ceph_cap_string(cap->issued & ~drop));
2857 cap->issued &= ~drop;
2858 cap->implemented &= ~drop;
2859 if (ci->i_ceph_flags & CEPH_I_NODELAY) {
2860 int wanted = __ceph_caps_wanted(ci);
2861 dout(" wanted %s -> %s (act %s)\n",
2862 ceph_cap_string(cap->mds_wanted),
2863 ceph_cap_string(cap->mds_wanted &
2864 ~wanted),
2865 ceph_cap_string(wanted));
2866 cap->mds_wanted &= wanted;
2867 }
2868 } else {
2869 dout("encode_inode_release %p cap %p %s"
2870 " (force)\n", inode, cap,
2871 ceph_cap_string(cap->issued));
2872 }
2873
2874 rel->ino = cpu_to_le64(ceph_ino(inode));
2875 rel->cap_id = cpu_to_le64(cap->cap_id);
2876 rel->seq = cpu_to_le32(cap->seq);
2877 rel->issue_seq = cpu_to_le32(cap->issue_seq);
2878 rel->mseq = cpu_to_le32(cap->mseq);
2879 rel->caps = cpu_to_le32(cap->issued);
2880 rel->wanted = cpu_to_le32(cap->mds_wanted);
2881 rel->dname_len = 0;
2882 rel->dname_seq = 0;
2883 *p += sizeof(*rel);
2884 ret = 1;
2885 } else {
2886 dout("encode_inode_release %p cap %p %s\n",
2887 inode, cap, ceph_cap_string(cap->issued));
2888 }
2889 }
2890 spin_unlock(&inode->i_lock);
2891 return ret;
2892}
2893
2894int ceph_encode_dentry_release(void **p, struct dentry *dentry,
2895 int mds, int drop, int unless)
2896{
2897 struct inode *dir = dentry->d_parent->d_inode;
2898 struct ceph_mds_request_release *rel = *p;
2899 struct ceph_dentry_info *di = ceph_dentry(dentry);
2900 int force = 0;
2901 int ret;
2902
2903 /*
2904 * force a record for the directory caps if we have a dentry lease.
2905 * this is racy (can't take i_lock and d_lock together), but it
2906 * doesn't have to be perfect; the mds will revoke anything we don't
2907 * release.
2908 */
2909 spin_lock(&dentry->d_lock);
2910 if (di->lease_session && di->lease_session->s_mds == mds)
2911 force = 1;
2912 spin_unlock(&dentry->d_lock);
2913
2914 ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
2915
2916 spin_lock(&dentry->d_lock);
2917 if (ret && di->lease_session && di->lease_session->s_mds == mds) {
2918 dout("encode_dentry_release %p mds%d seq %d\n",
2919 dentry, mds, (int)di->lease_seq);
2920 rel->dname_len = cpu_to_le32(dentry->d_name.len);
2921 memcpy(*p, dentry->d_name.name, dentry->d_name.len);
2922 *p += dentry->d_name.len;
2923 rel->dname_seq = cpu_to_le32(di->lease_seq);
2924 }
2925 spin_unlock(&dentry->d_lock);
2926 return ret;
2927}
diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h
new file mode 100644
index 000000000000..1818c2305610
--- /dev/null
+++ b/fs/ceph/ceph_debug.h
@@ -0,0 +1,37 @@
1#ifndef _FS_CEPH_DEBUG_H
2#define _FS_CEPH_DEBUG_H
3
4#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
5
6#ifdef CONFIG_CEPH_FS_PRETTYDEBUG
7
8/*
9 * wrap pr_debug to include a filename:lineno prefix on each line.
10 * this incurs some overhead (kernel size and execution time) due to
11 * the extra function call at each call site.
12 */
13
14# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
15extern const char *ceph_file_part(const char *s, int len);
16# define dout(fmt, ...) \
17 pr_debug(" %12.12s:%-4d : " fmt, \
18 ceph_file_part(__FILE__, sizeof(__FILE__)), \
19 __LINE__, ##__VA_ARGS__)
20# else
21/* faux printk call just to see any compiler warnings. */
22# define dout(fmt, ...) do { \
23 if (0) \
24 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
25 } while (0)
26# endif
27
28#else
29
30/*
31 * or, just wrap pr_debug
32 */
33# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__)
34
35#endif
36
37#endif
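For reference, a sketch of the output dout() produces when CONFIG_CEPH_FS_PRETTYDEBUG and dynamic debug are enabled (illustrative; the exact prefix depends on the kernel's dynamic-debug settings):

	ceph:       caps.c:2628 : handle_caps from mds0

pr_fmt() supplies the "ceph: " module prefix; the wrapper adds the right-aligned file name (via ceph_file_part) and the line number.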
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
new file mode 100644
index 000000000000..ab6cf35c4091
--- /dev/null
+++ b/fs/ceph/ceph_frag.c
@@ -0,0 +1,21 @@
1/*
2 * Ceph 'frag' type
3 */
4#include "types.h"
5
6int ceph_frag_compare(__u32 a, __u32 b)
7{
8 unsigned va = ceph_frag_value(a);
9 unsigned vb = ceph_frag_value(b);
10 if (va < vb)
11 return -1;
12 if (va > vb)
13 return 1;
14 va = ceph_frag_bits(a);
15 vb = ceph_frag_bits(b);
16 if (va < vb)
17 return -1;
18 if (va > vb)
19 return 1;
20 return 0;
21}
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
new file mode 100644
index 000000000000..793f50cb7c22
--- /dev/null
+++ b/fs/ceph/ceph_frag.h
@@ -0,0 +1,109 @@
1#ifndef _FS_CEPH_FRAG_H
2#define _FS_CEPH_FRAG_H
3
4/*
5 * "Frags" are a way to describe a subset of a 32-bit number space,
6 * using a mask and a value to match against that mask. Any given frag
7 * (subset of the number space) can be partitioned into 2^n sub-frags.
8 *
9 * Frags are encoded into a 32-bit word:
10 * 8 upper bits = "bits"
11 * 24 lower bits = "value"
12 * (We could go to 5+27 bits, but who cares.)
13 *
14 * We use the _most_ significant bits of the 24 bit value. This makes
15 * values logically sort.
16 *
17 * Unfortunately, because the "bits" field is still in the high bits, we
18 * can't sort encoded frags numerically. However, it does allow you
19 * to feed encoded frags as values into frag_contains_value.
20 */
21static inline __u32 ceph_frag_make(__u32 b, __u32 v)
22{
23 return (b << 24) |
24 (v & (0xffffffu << (24-b)) & 0xffffffu);
25}
26static inline __u32 ceph_frag_bits(__u32 f)
27{
28 return f >> 24;
29}
30static inline __u32 ceph_frag_value(__u32 f)
31{
32 return f & 0xffffffu;
33}
34static inline __u32 ceph_frag_mask(__u32 f)
35{
36 return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
37}
38static inline __u32 ceph_frag_mask_shift(__u32 f)
39{
40 return 24 - ceph_frag_bits(f);
41}
42
43static inline int ceph_frag_contains_value(__u32 f, __u32 v)
44{
45 return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
46}
47static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
48{
49 /* is sub as specific as us, and contained by us? */
50 return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
51 (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
52}
53
54static inline __u32 ceph_frag_parent(__u32 f)
55{
56 return ceph_frag_make(ceph_frag_bits(f) - 1,
57 ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
58}
59static inline int ceph_frag_is_left_child(__u32 f)
60{
61 return ceph_frag_bits(f) > 0 &&
62 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
63}
64static inline int ceph_frag_is_right_child(__u32 f)
65{
66 return ceph_frag_bits(f) > 0 &&
67 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) != 0;
68}
69static inline __u32 ceph_frag_sibling(__u32 f)
70{
71 return ceph_frag_make(ceph_frag_bits(f),
72 ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
73}
74static inline __u32 ceph_frag_left_child(__u32 f)
75{
76 return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
77}
78static inline __u32 ceph_frag_right_child(__u32 f)
79{
80 return ceph_frag_make(ceph_frag_bits(f)+1,
81 ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
82}
83static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
84{
85 int newbits = ceph_frag_bits(f) + by;
86 return ceph_frag_make(newbits,
87 ceph_frag_value(f) | (i << (24 - newbits)));
88}
89static inline int ceph_frag_is_leftmost(__u32 f)
90{
91 return ceph_frag_value(f) == 0;
92}
93static inline int ceph_frag_is_rightmost(__u32 f)
94{
95 return ceph_frag_value(f) == ceph_frag_mask(f);
96}
97static inline __u32 ceph_frag_next(__u32 f)
98{
99 return ceph_frag_make(ceph_frag_bits(f),
100 ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
101}
102
103/*
104 * comparator to sort frags logically, as when traversing the
105 * number space in ascending order...
106 */
107int ceph_frag_compare(__u32 a, __u32 b);
108
109#endif
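To make the encoding concrete, here is a minimal user-space sketch (illustrative, not part of this merge) exercising the helpers above; it assumes __u32 comes from <linux/types.h> and that ceph_frag.h is on the include path:

	#include <assert.h>
	#include <linux/types.h>
	#include "ceph_frag.h"

	int main(void)
	{
		__u32 root  = ceph_frag_make(0, 0);         /* whole 24-bit space */
		__u32 upper = ceph_frag_make(1, 0x800000);  /* bits=1: upper half */

		assert(upper == 0x01800000);  /* 8-bit "bits" | 24-bit value */
		assert(ceph_frag_contains_value(root, 0x123456));   /* root matches all */
		assert(ceph_frag_contains_value(upper, 0xabcdef));  /* top bit set */
		assert(!ceph_frag_contains_value(upper, 0x0000ff)); /* top bit clear */
		assert(ceph_frag_contains_frag(root, upper));
		assert(ceph_frag_parent(upper) == root);
		assert(ceph_frag_left_child(upper) == ceph_frag_make(2, 0x800000));
		return 0;
	}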
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
new file mode 100644
index 000000000000..79d76bc4303f
--- /dev/null
+++ b/fs/ceph/ceph_fs.c
@@ -0,0 +1,74 @@
1/*
2 * Some non-inline ceph helpers
3 */
4#include "types.h"
5
6/*
7 * return true if @layout appears to be valid
8 */
9int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
10{
11 __u32 su = le32_to_cpu(layout->fl_stripe_unit);
12 __u32 sc = le32_to_cpu(layout->fl_stripe_count);
13 __u32 os = le32_to_cpu(layout->fl_object_size);
14
15 /* stripe unit, object size must be non-zero, 64k increment */
16 if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
17 return 0;
18 if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
19 return 0;
20 /* object size must be a multiple of stripe unit */
21 if (os < su || os % su)
22 return 0;
23 /* stripe count must be non-zero */
24 if (!sc)
25 return 0;
26 return 1;
27}
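/*
 * Worked example (illustrative): a typical layout passes the checks
 * above --
 *   fl_stripe_unit  = 65536    (non-zero, multiple of 64k)
 *   fl_object_size  = 4194304  (non-zero, multiple of 64k, and a
 *                               multiple of the stripe unit)
 *   fl_stripe_count = 1        (non-zero)
 * so ceph_file_layout_is_valid() returns 1.  A 4096-byte stripe unit,
 * by contrast, would fail the 64k-increment test.
 */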
28
29
30int ceph_flags_to_mode(int flags)
31{
32#ifdef O_DIRECTORY /* fixme */
33 if ((flags & O_DIRECTORY) == O_DIRECTORY)
34 return CEPH_FILE_MODE_PIN;
35#endif
36#ifdef O_LAZY
37 if (flags & O_LAZY)
38 return CEPH_FILE_MODE_LAZY;
39#endif
40 if ((flags & O_APPEND) == O_APPEND)
41 flags |= O_WRONLY;
42
43 flags &= O_ACCMODE;
44 if ((flags & O_RDWR) == O_RDWR)
45 return CEPH_FILE_MODE_RDWR;
46 if ((flags & O_WRONLY) == O_WRONLY)
47 return CEPH_FILE_MODE_WR;
48 return CEPH_FILE_MODE_RD;
49}
50
51int ceph_caps_for_mode(int mode)
52{
53 switch (mode) {
54 case CEPH_FILE_MODE_PIN:
55 return CEPH_CAP_PIN;
56 case CEPH_FILE_MODE_RD:
57 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
58 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
59 case CEPH_FILE_MODE_RDWR:
60 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
61 CEPH_CAP_FILE_EXCL |
62 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE |
63 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
64 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
65 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
66 case CEPH_FILE_MODE_WR:
67 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
68 CEPH_CAP_FILE_EXCL |
69 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
70 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
71 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
72 }
73 return 0;
74}
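/*
 * Sketch (illustrative) of how the two helpers above compose,
 * assuming the usual <fcntl.h> O_* semantics:
 *
 *   ceph_flags_to_mode(O_RDONLY)            == CEPH_FILE_MODE_RD
 *   ceph_flags_to_mode(O_WRONLY | O_APPEND) == CEPH_FILE_MODE_WR
 *   ceph_flags_to_mode(O_RDWR)              == CEPH_FILE_MODE_RDWR
 *
 * and ceph_caps_for_mode(CEPH_FILE_MODE_RD) yields
 *   CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
 *   CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE,
 * i.e. everything a reader needs before it may cache data.
 */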
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
new file mode 100644
index 000000000000..0c2241ef3653
--- /dev/null
+++ b/fs/ceph/ceph_fs.h
@@ -0,0 +1,650 @@
1/*
2 * ceph_fs.h - Ceph constants and data types to share between kernel and
3 * user space.
4 *
5 * Most types in this file are defined as little-endian, and are
6 * primarily intended to describe data structures that pass over the
7 * wire or that are stored on disk.
8 *
9 * LGPL2
10 */
11
12#ifndef _FS_CEPH_CEPH_FS_H
13#define _FS_CEPH_CEPH_FS_H
14
15#include "msgr.h"
16#include "rados.h"
17
18/*
19 * Ceph release version
20 */
21#define CEPH_VERSION_MAJOR 0
22#define CEPH_VERSION_MINOR 19
23#define CEPH_VERSION_PATCH 0
24
25#define _CEPH_STRINGIFY(x) #x
26#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x)
27#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \
28 "." CEPH_STRINGIFY(z)
29#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \
30 CEPH_VERSION_MINOR, CEPH_VERSION_PATCH)
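/* e.g. CEPH_VERSION expands, via the two-level stringify, to "0.19.0" */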
31
32/*
33 * subprotocol versions. when specific message types or high-level
34 * protocols change, bump the affected components. we rev
35 * internal cluster protocols separately from the public,
36 * client-facing protocol.
37 */
38#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
39#define CEPH_MDS_PROTOCOL 9 /* cluster internal */
40#define CEPH_MON_PROTOCOL 5 /* cluster internal */
41#define CEPH_OSDC_PROTOCOL 24 /* server/client */
42#define CEPH_MDSC_PROTOCOL 32 /* server/client */
43#define CEPH_MONC_PROTOCOL 15 /* server/client */
44
45
46#define CEPH_INO_ROOT 1
47#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
48
49/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
50#define CEPH_MAX_MON 31
51
52
53/*
54 * feature bits
55 */
56#define CEPH_FEATURE_SUPPORTED 0
57#define CEPH_FEATURE_REQUIRED 0
58
59
60/*
61 * ceph_file_layout - describe data layout for a file/inode
62 */
63struct ceph_file_layout {
64 /* file -> object mapping */
65 __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
66 of page size. */
67 __le32 fl_stripe_count; /* over this many objects */
68 __le32 fl_object_size; /* until objects are this big, then move to
69 new objects */
70 __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */
71
72 /* pg -> disk layout */
73 __le32 fl_object_stripe_unit; /* for per-object parity, if any */
74
75 /* object -> pg layout */
76 __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
77 __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
78} __attribute__ ((packed));
79
80#define CEPH_MIN_STRIPE_UNIT 65536
81
82int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
83
84
85/* crypto algorithms */
86#define CEPH_CRYPTO_NONE 0x0
87#define CEPH_CRYPTO_AES 0x1
88
89/* security/authentication protocols */
90#define CEPH_AUTH_UNKNOWN 0x0
91#define CEPH_AUTH_NONE 0x1
92#define CEPH_AUTH_CEPHX 0x2
93
94
95/*********************************************
96 * message layer
97 */
98
99/*
100 * message types
101 */
102
103/* misc */
104#define CEPH_MSG_SHUTDOWN 1
105#define CEPH_MSG_PING 2
106
107/* client <-> monitor */
108#define CEPH_MSG_MON_MAP 4
109#define CEPH_MSG_MON_GET_MAP 5
110#define CEPH_MSG_STATFS 13
111#define CEPH_MSG_STATFS_REPLY 14
112#define CEPH_MSG_MON_SUBSCRIBE 15
113#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
114#define CEPH_MSG_AUTH 17
115#define CEPH_MSG_AUTH_REPLY 18
116
117/* client <-> mds */
118#define CEPH_MSG_MDS_MAP 21
119
120#define CEPH_MSG_CLIENT_SESSION 22
121#define CEPH_MSG_CLIENT_RECONNECT 23
122
123#define CEPH_MSG_CLIENT_REQUEST 24
124#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
125#define CEPH_MSG_CLIENT_REPLY 26
126#define CEPH_MSG_CLIENT_CAPS 0x310
127#define CEPH_MSG_CLIENT_LEASE 0x311
128#define CEPH_MSG_CLIENT_SNAP 0x312
129#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
130
131/* osd */
132#define CEPH_MSG_OSD_MAP 41
133#define CEPH_MSG_OSD_OP 42
134#define CEPH_MSG_OSD_OPREPLY 43
135
136struct ceph_mon_request_header {
137 __le64 have_version;
138 __le16 session_mon;
139 __le64 session_mon_tid;
140} __attribute__ ((packed));
141
142struct ceph_mon_statfs {
143 struct ceph_mon_request_header monhdr;
144 struct ceph_fsid fsid;
145} __attribute__ ((packed));
146
147struct ceph_statfs {
148 __le64 kb, kb_used, kb_avail;
149 __le64 num_objects;
150} __attribute__ ((packed));
151
152struct ceph_mon_statfs_reply {
153 struct ceph_fsid fsid;
154 __le64 version;
155 struct ceph_statfs st;
156} __attribute__ ((packed));
157
158struct ceph_osd_getmap {
159 struct ceph_mon_request_header monhdr;
160 struct ceph_fsid fsid;
161 __le32 start;
162} __attribute__ ((packed));
163
164struct ceph_mds_getmap {
165 struct ceph_mon_request_header monhdr;
166 struct ceph_fsid fsid;
167} __attribute__ ((packed));
168
169struct ceph_client_mount {
170 struct ceph_mon_request_header monhdr;
171} __attribute__ ((packed));
172
173struct ceph_mon_subscribe_item {
174 __le64 have_version; __le64 have;
175 __u8 onetime;
176} __attribute__ ((packed));
177
178struct ceph_mon_subscribe_ack {
179 __le32 duration; /* seconds */
180 struct ceph_fsid fsid;
181} __attribute__ ((packed));
182
183/*
184 * mds states
185 * > 0 -> in
186 * <= 0 -> out
187 */
188#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
189#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
190 empty log. */
191#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
192#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
193#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
194#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
195#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
196
197#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
198#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
199 operations (import, rename, etc.) */
200#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
201#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
202#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
203#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
204#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
205
206extern const char *ceph_mds_state_name(int s);
207
208
209/*
210 * metadata lock types.
211 * - these are bitmasks.. we can compose them
212 * - they also define the lock ordering by the MDS
213 * - a few of these are internal to the mds
214 */
215#define CEPH_LOCK_DN 1
216#define CEPH_LOCK_ISNAP 2
217#define CEPH_LOCK_IVERSION 4 /* mds internal */
218#define CEPH_LOCK_IFILE 8 /* mds internal */
219#define CEPH_LOCK_IAUTH 32
220#define CEPH_LOCK_ILINK 64
221#define CEPH_LOCK_IDFT 128 /* dir frag tree */
222#define CEPH_LOCK_INEST 256 /* mds internal */
223#define CEPH_LOCK_IXATTR 512
224#define CEPH_LOCK_INO 2048 /* immutable inode bits; not a lock */
225
226/* client_session ops */
227enum {
228 CEPH_SESSION_REQUEST_OPEN,
229 CEPH_SESSION_OPEN,
230 CEPH_SESSION_REQUEST_CLOSE,
231 CEPH_SESSION_CLOSE,
232 CEPH_SESSION_REQUEST_RENEWCAPS,
233 CEPH_SESSION_RENEWCAPS,
234 CEPH_SESSION_STALE,
235 CEPH_SESSION_RECALL_STATE,
236};
237
238extern const char *ceph_session_op_name(int op);
239
240struct ceph_mds_session_head {
241 __le32 op;
242 __le64 seq;
243 struct ceph_timespec stamp;
244 __le32 max_caps, max_leases;
245} __attribute__ ((packed));
246
247/* client_request */
248/*
249 * metadata ops.
250 * & 0x001000 -> write op
251 * & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
252 * & 0x100000 -> use weird ino/path trace
253 */
254#define CEPH_MDS_OP_WRITE 0x001000
255enum {
256 CEPH_MDS_OP_LOOKUP = 0x00100,
257 CEPH_MDS_OP_GETATTR = 0x00101,
258 CEPH_MDS_OP_LOOKUPHASH = 0x00102,
259 CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
260
261 CEPH_MDS_OP_SETXATTR = 0x01105,
262 CEPH_MDS_OP_RMXATTR = 0x01106,
263 CEPH_MDS_OP_SETLAYOUT = 0x01107,
264 CEPH_MDS_OP_SETATTR = 0x01108,
265
266 CEPH_MDS_OP_MKNOD = 0x01201,
267 CEPH_MDS_OP_LINK = 0x01202,
268 CEPH_MDS_OP_UNLINK = 0x01203,
269 CEPH_MDS_OP_RENAME = 0x01204,
270 CEPH_MDS_OP_MKDIR = 0x01220,
271 CEPH_MDS_OP_RMDIR = 0x01221,
272 CEPH_MDS_OP_SYMLINK = 0x01222,
273
274 CEPH_MDS_OP_CREATE = 0x01301,
275 CEPH_MDS_OP_OPEN = 0x00302,
276 CEPH_MDS_OP_READDIR = 0x00305,
277
278 CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
279 CEPH_MDS_OP_MKSNAP = 0x01400,
280 CEPH_MDS_OP_RMSNAP = 0x01401,
281 CEPH_MDS_OP_LSSNAP = 0x00402,
282};
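/*
 * Worked example (illustrative): the encoding makes write-ness a bit
 * test rather than a table lookup --
 *   (CEPH_MDS_OP_MKDIR   (0x01220) & CEPH_MDS_OP_WRITE) != 0  (write op)
 *   (CEPH_MDS_OP_GETATTR (0x00101) & CEPH_MDS_OP_WRITE) == 0  (read-only)
 */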
283
284extern const char *ceph_mds_op_name(int op);
285
286
287#define CEPH_SETATTR_MODE 1
288#define CEPH_SETATTR_UID 2
289#define CEPH_SETATTR_GID 4
290#define CEPH_SETATTR_MTIME 8
291#define CEPH_SETATTR_ATIME 16
292#define CEPH_SETATTR_SIZE 32
293#define CEPH_SETATTR_CTIME 64
294
295union ceph_mds_request_args {
296 struct {
297 __le32 mask; /* CEPH_CAP_* */
298 } __attribute__ ((packed)) getattr;
299 struct {
300 __le32 mode;
301 __le32 uid;
302 __le32 gid;
303 struct ceph_timespec mtime;
304 struct ceph_timespec atime;
305 __le64 size, old_size; /* old_size needed by truncate */
306 __le32 mask; /* CEPH_SETATTR_* */
307 } __attribute__ ((packed)) setattr;
308 struct {
309 __le32 frag; /* which dir fragment */
310 __le32 max_entries; /* how many dentries to grab */
311 } __attribute__ ((packed)) readdir;
312 struct {
313 __le32 mode;
314 __le32 rdev;
315 } __attribute__ ((packed)) mknod;
316 struct {
317 __le32 mode;
318 } __attribute__ ((packed)) mkdir;
319 struct {
320 __le32 flags;
321 __le32 mode;
322 __le32 stripe_unit; /* layout for newly created file */
323 __le32 stripe_count; /* ... */
324 __le32 object_size;
325 __le32 file_replication;
326 __le32 preferred;
327 } __attribute__ ((packed)) open;
328 struct {
329 __le32 flags;
330 } __attribute__ ((packed)) setxattr;
331 struct {
332 struct ceph_file_layout layout;
333 } __attribute__ ((packed)) setlayout;
334} __attribute__ ((packed));
335
336#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
337#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
338
339struct ceph_mds_request_head {
340 __le64 oldest_client_tid;
341 __le32 mdsmap_epoch; /* on client */
342 __le32 flags; /* CEPH_MDS_FLAG_* */
343 __u8 num_retry, num_fwd; /* count retry, fwd attempts */
344 __le16 num_releases; /* # of included cap/lease release records */
345 __le32 op; /* mds op code */
346 __le32 caller_uid, caller_gid;
347 __le64 ino; /* use this ino for openc, mkdir, mknod,
348 etc. (if replaying) */
349 union ceph_mds_request_args args;
350} __attribute__ ((packed));
351
352/* cap/lease release record */
353struct ceph_mds_request_release {
354 __le64 ino, cap_id; /* ino and unique cap id */
355 __le32 caps, wanted; /* new issued, wanted */
356 __le32 seq, issue_seq, mseq;
357 __le32 dname_seq; /* if releasing a dentry lease, a */
358 __le32 dname_len; /* string follows. */
359} __attribute__ ((packed));
360
361/* client reply */
362struct ceph_mds_reply_head {
363 __le32 op;
364 __le32 result;
365 __le32 mdsmap_epoch;
366 __u8 safe; /* true if committed to disk */
367 __u8 is_dentry, is_target; /* true if dentry, target inode records
368 are included with reply */
369} __attribute__ ((packed));
370
371/* one for each node split */
372struct ceph_frag_tree_split {
373 __le32 frag; /* this frag splits... */
374 __le32 by; /* ...by this many bits */
375} __attribute__ ((packed));
376
377struct ceph_frag_tree_head {
378 __le32 nsplits; /* num ceph_frag_tree_split records */
379 struct ceph_frag_tree_split splits[];
380} __attribute__ ((packed));
381
382/* capability issue, for bundling with mds reply */
383struct ceph_mds_reply_cap {
384 __le32 caps, wanted; /* caps issued, wanted */
385 __le64 cap_id;
386 __le32 seq, mseq;
387 __le64 realm; /* snap realm */
388 __u8 flags; /* CEPH_CAP_FLAG_* */
389} __attribute__ ((packed));
390
391#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */
392
393/* inode record, for bundling with mds reply */
394struct ceph_mds_reply_inode {
395 __le64 ino;
396 __le64 snapid;
397 __le32 rdev;
398 __le64 version; /* inode version */
399 __le64 xattr_version; /* version for xattr blob */
400 struct ceph_mds_reply_cap cap; /* caps issued for this inode */
401 struct ceph_file_layout layout;
402 struct ceph_timespec ctime, mtime, atime;
403 __le32 time_warp_seq;
404 __le64 size, max_size, truncate_size;
405 __le32 truncate_seq;
406 __le32 mode, uid, gid;
407 __le32 nlink;
408 __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */
409 struct ceph_timespec rctime;
410 struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */
411} __attribute__ ((packed));
412/* followed by frag array, then symlink string, then xattr blob */
413
414/* reply_lease follows dname, and reply_inode */
415struct ceph_mds_reply_lease {
416 __le16 mask; /* lease type(s) */
417 __le32 duration_ms; /* lease duration */
418 __le32 seq;
419} __attribute__ ((packed));
420
421struct ceph_mds_reply_dirfrag {
422 __le32 frag; /* fragment */
423 __le32 auth; /* auth mds, if this is a delegation point */
424 __le32 ndist; /* number of mds' this is replicated on */
425 __le32 dist[];
426} __attribute__ ((packed));
427
428/* file access modes */
429#define CEPH_FILE_MODE_PIN 0
430#define CEPH_FILE_MODE_RD 1
431#define CEPH_FILE_MODE_WR 2
432#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
433#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
434#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */
435
436int ceph_flags_to_mode(int flags);
437
438
439/* capability bits */
440#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */
441
442/* generic cap bits */
443#define CEPH_CAP_GSHARED 1 /* client can read */
444#define CEPH_CAP_GEXCL 2 /* client can read and update */
445#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */
446#define CEPH_CAP_GRD 8 /* (file) client can read */
447#define CEPH_CAP_GWR 16 /* (file) client can write */
448#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */
449#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
450#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
451
452/* per-lock shift */
453#define CEPH_CAP_SAUTH 2
454#define CEPH_CAP_SLINK 4
455#define CEPH_CAP_SXATTR 6
456#define CEPH_CAP_SFILE 8 /* goes at the end (uses >2 cap bits) */
457
458#define CEPH_CAP_BITS 16
459
460/* composed values */
461#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
462#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)
463#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK)
464#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK)
465#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR)
466#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR)
467#define CEPH_CAP_FILE(x) (x << CEPH_CAP_SFILE)
468#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE)
469#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE)
470#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE)
471#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE)
472#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE)
473#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
474#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
475#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
476
477/* cap masks (for getattr) */
478#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
479#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
480#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
481#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
482#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
483#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
484#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
485#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
486#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
487#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
488#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
489#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
490#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
491 CEPH_CAP_AUTH_SHARED | \
492 CEPH_CAP_LINK_SHARED | \
493 CEPH_CAP_FILE_SHARED | \
494 CEPH_CAP_XATTR_SHARED)
495
496#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
497 CEPH_CAP_LINK_SHARED | \
498 CEPH_CAP_XATTR_SHARED | \
499 CEPH_CAP_FILE_SHARED)
500#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
501 CEPH_CAP_FILE_CACHE)
502
503#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
504 CEPH_CAP_LINK_EXCL | \
505 CEPH_CAP_XATTR_EXCL | \
506 CEPH_CAP_FILE_EXCL)
507#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
508 CEPH_CAP_FILE_EXCL)
509#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
510#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
511 CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN)
512
513#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
514 CEPH_LOCK_IXATTR)
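/*
 * Worked example (illustrative) of the per-lock composition above:
 *   CEPH_CAP_AUTH_SHARED = CEPH_CAP_GSHARED << CEPH_CAP_SAUTH = 1 << 2  = 0x0004
 *   CEPH_CAP_FILE_WR     = CEPH_CAP_GWR     << CEPH_CAP_SFILE = 16 << 8 = 0x1000
 * so a cap word of 0x1005 decodes as PIN | AUTH_SHARED | FILE_WR.
 */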
515
516int ceph_caps_for_mode(int mode);
517
518enum {
519 CEPH_CAP_OP_GRANT, /* mds->client grant */
520 CEPH_CAP_OP_REVOKE, /* mds->client revoke */
521 CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
522 CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
523 CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
524 CEPH_CAP_OP_UPDATE, /* client->mds update */
525 CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
526 CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
527 CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
528 CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
529 CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
530 CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
531 CEPH_CAP_OP_RENEW, /* client->mds renewal request */
532};
533
534extern const char *ceph_cap_op_name(int op);
535
536/*
537 * caps message, used for capability callbacks, acks, requests, etc.
538 */
539struct ceph_mds_caps {
540 __le32 op; /* CEPH_CAP_OP_* */
541 __le64 ino, realm;
542 __le64 cap_id;
543 __le32 seq, issue_seq;
544 __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
545 __le32 migrate_seq;
546 __le64 snap_follows;
547 __le32 snap_trace_len;
548
549 /* authlock */
550 __le32 uid, gid, mode;
551
552 /* linklock */
553 __le32 nlink;
554
555 /* xattrlock */
556 __le32 xattr_len;
557 __le64 xattr_version;
558
559 /* filelock */
560 __le64 size, max_size, truncate_size;
561 __le32 truncate_seq;
562 struct ceph_timespec mtime, atime, ctime;
563 struct ceph_file_layout layout;
564 __le32 time_warp_seq;
565} __attribute__ ((packed));
566
567/* cap release msg head */
568struct ceph_mds_cap_release {
569 __le32 num; /* number of cap_items that follow */
570} __attribute__ ((packed));
571
572struct ceph_mds_cap_item {
573 __le64 ino;
574 __le64 cap_id;
575 __le32 migrate_seq, seq;
576} __attribute__ ((packed));
577
578#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
579#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
580#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
581#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
582
583extern const char *ceph_lease_op_name(int o);
584
585/* lease msg header */
586struct ceph_mds_lease {
587 __u8 action; /* CEPH_MDS_LEASE_* */
588 __le16 mask; /* which lease */
589 __le64 ino;
590 __le64 first, last; /* snap range */
591 __le32 seq;
592 __le32 duration_ms; /* duration of renewal */
593} __attribute__ ((packed));
594/* followed by a __le32+string for dname */
595
596/* client reconnect */
597struct ceph_mds_cap_reconnect {
598 __le64 cap_id;
599 __le32 wanted;
600 __le32 issued;
601 __le64 size;
602 struct ceph_timespec mtime, atime;
603 __le64 snaprealm;
604 __le64 pathbase; /* base ino for our path to this ino */
605} __attribute__ ((packed));
606/* followed by encoded string */
607
608struct ceph_mds_snaprealm_reconnect {
609 __le64 ino; /* snap realm base */
610 __le64 seq; /* snap seq for this snap realm */
611 __le64 parent; /* parent realm */
612} __attribute__ ((packed));
613
614/*
615 * snaps
616 */
617enum {
618 CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
619 CEPH_SNAP_OP_CREATE,
620 CEPH_SNAP_OP_DESTROY,
621 CEPH_SNAP_OP_SPLIT,
622};
623
624extern const char *ceph_snap_op_name(int o);
625
626/* snap msg header */
627struct ceph_mds_snap_head {
628 __le32 op; /* CEPH_SNAP_OP_* */
629 __le64 split; /* ino to split off, if any */
630 __le32 num_split_inos; /* # inos belonging to new child realm */
631 __le32 num_split_realms; /* # child realms under new child realm */
632 __le32 trace_len; /* size of snap trace blob */
633} __attribute__ ((packed));
634/* followed by split ino list, then split realms, then the trace blob */
635
636/*
637 * encode info about a snaprealm, as viewed by a client
638 */
639struct ceph_mds_snap_realm {
640 __le64 ino; /* ino */
641 __le64 created; /* snap: when created */
642 __le64 parent; /* ino: parent realm */
643 __le64 parent_since; /* snap: same parent since */
644 __le64 seq; /* snap: version */
645 __le32 num_snaps;
646 __le32 num_prior_parent_snaps;
647} __attribute__ ((packed));
648/* followed by my snap list, then prior parent snap list */
649
650#endif
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
new file mode 100644
index 000000000000..bd570015d147
--- /dev/null
+++ b/fs/ceph/ceph_hash.c
@@ -0,0 +1,118 @@
1
2#include "types.h"
3
4/*
5 * Robert Jenkins' hash function.
6 * http://burtleburtle.net/bob/hash/evahash.html
7 * This is in the public domain.
8 */
9#define mix(a, b, c) \
10 do { \
11 a = a - b; a = a - c; a = a ^ (c >> 13); \
12 b = b - c; b = b - a; b = b ^ (a << 8); \
13 c = c - a; c = c - b; c = c ^ (b >> 13); \
14 a = a - b; a = a - c; a = a ^ (c >> 12); \
15 b = b - c; b = b - a; b = b ^ (a << 16); \
16 c = c - a; c = c - b; c = c ^ (b >> 5); \
17 a = a - b; a = a - c; a = a ^ (c >> 3); \
18 b = b - c; b = b - a; b = b ^ (a << 10); \
19 c = c - a; c = c - b; c = c ^ (b >> 15); \
20 } while (0)
21
22unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
23{
24 const unsigned char *k = (const unsigned char *)str;
25 __u32 a, b, c; /* the internal state */
26 __u32 len; /* how many key bytes still need mixing */
27
28 /* Set up the internal state */
29 len = length;
30 a = 0x9e3779b9; /* the golden ratio; an arbitrary value */
31 b = a;
32 c = 0; /* variable initialization of internal state */
33
34 /* handle most of the key */
35 while (len >= 12) {
36 a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
37 ((__u32)k[3] << 24));
38 b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
39 ((__u32)k[7] << 24));
40 c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
41 ((__u32)k[11] << 24));
42 mix(a, b, c);
43 k = k + 12;
44 len = len - 12;
45 }
46
47 /* handle the last 11 bytes */
48 c = c + length;
49 switch (len) { /* all the case statements fall through */
50 case 11:
51 c = c + ((__u32)k[10] << 24);
52 case 10:
53 c = c + ((__u32)k[9] << 16);
54 case 9:
55 c = c + ((__u32)k[8] << 8);
56 /* the first byte of c is reserved for the length */
57 case 8:
58 b = b + ((__u32)k[7] << 24);
59 case 7:
60 b = b + ((__u32)k[6] << 16);
61 case 6:
62 b = b + ((__u32)k[5] << 8);
63 case 5:
64 b = b + k[4];
65 case 4:
66 a = a + ((__u32)k[3] << 24);
67 case 3:
68 a = a + ((__u32)k[2] << 16);
69 case 2:
70 a = a + ((__u32)k[1] << 8);
71 case 1:
72 a = a + k[0];
73 /* case 0: nothing left to add */
74 }
75 mix(a, b, c);
76
77 return c;
78}
79
80/*
81 * linux dcache hash
82 */
83unsigned ceph_str_hash_linux(const char *str, unsigned length)
84{
85 unsigned long hash = 0;
86 unsigned char c;
87
88 while (length--) {
89 c = *str++;
90 hash = (hash + (c << 4) + (c >> 4)) * 11;
91 }
92 return hash;
93}
94
95
96unsigned ceph_str_hash(int type, const char *s, unsigned len)
97{
98 switch (type) {
99 case CEPH_STR_HASH_LINUX:
100 return ceph_str_hash_linux(s, len);
101 case CEPH_STR_HASH_RJENKINS:
102 return ceph_str_hash_rjenkins(s, len);
103 default:
104 return -1;
105 }
106}
107
108const char *ceph_str_hash_name(int type)
109{
110 switch (type) {
111 case CEPH_STR_HASH_LINUX:
112 return "linux";
113 case CEPH_STR_HASH_RJENKINS:
114 return "rjenkins";
115 default:
116 return "unknown";
117 }
118}
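/*
 * Usage sketch (illustrative, not part of the merge): both hashes
 * depend only on the bytes and the length -- there is no per-host
 * seed -- so, e.g.,
 *
 *	unsigned h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, "foo", 3);
 *
 * yields the same value on every host, which is what lets client and
 * server agree on where a given name hashes (e.g. for selecting a
 * directory fragment).
 */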
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
new file mode 100644
index 000000000000..5ac470c433c9
--- /dev/null
+++ b/fs/ceph/ceph_hash.h
@@ -0,0 +1,13 @@
1#ifndef _FS_CEPH_HASH_H
2#define _FS_CEPH_HASH_H
3
4#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
5#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
6
7extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
8extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
9
10extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
11extern const char *ceph_str_hash_name(int type);
12
13#endif
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
new file mode 100644
index 000000000000..8e4be6a80c62
--- /dev/null
+++ b/fs/ceph/ceph_strings.c
@@ -0,0 +1,176 @@
1/*
2 * Ceph string constants
3 */
4#include "types.h"
5
6const char *ceph_entity_type_name(int type)
7{
8 switch (type) {
9 case CEPH_ENTITY_TYPE_MDS: return "mds";
10 case CEPH_ENTITY_TYPE_OSD: return "osd";
11 case CEPH_ENTITY_TYPE_MON: return "mon";
12 case CEPH_ENTITY_TYPE_CLIENT: return "client";
13 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
14 case CEPH_ENTITY_TYPE_AUTH: return "auth";
15 default: return "unknown";
16 }
17}
18
19const char *ceph_osd_op_name(int op)
20{
21 switch (op) {
22 case CEPH_OSD_OP_READ: return "read";
23 case CEPH_OSD_OP_STAT: return "stat";
24
25 case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
26
27 case CEPH_OSD_OP_WRITE: return "write";
28 case CEPH_OSD_OP_DELETE: return "delete";
29 case CEPH_OSD_OP_TRUNCATE: return "truncate";
30 case CEPH_OSD_OP_ZERO: return "zero";
31 case CEPH_OSD_OP_WRITEFULL: return "writefull";
32
33 case CEPH_OSD_OP_APPEND: return "append";
34 case CEPH_OSD_OP_STARTSYNC: return "startsync";
35 case CEPH_OSD_OP_SETTRUNC: return "settrunc";
36 case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
37
38 case CEPH_OSD_OP_TMAPUP: return "tmapup";
39 case CEPH_OSD_OP_TMAPGET: return "tmapget";
40 case CEPH_OSD_OP_TMAPPUT: return "tmapput";
41
42 case CEPH_OSD_OP_GETXATTR: return "getxattr";
43 case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
44 case CEPH_OSD_OP_SETXATTR: return "setxattr";
45 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
46 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
47 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
48
49 case CEPH_OSD_OP_PULL: return "pull";
50 case CEPH_OSD_OP_PUSH: return "push";
51 case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
52 case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
53 case CEPH_OSD_OP_SCRUB: return "scrub";
54
55 case CEPH_OSD_OP_WRLOCK: return "wrlock";
56 case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
57 case CEPH_OSD_OP_RDLOCK: return "rdlock";
58 case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
59 case CEPH_OSD_OP_UPLOCK: return "uplock";
60 case CEPH_OSD_OP_DNLOCK: return "dnlock";
61
62 case CEPH_OSD_OP_CALL: return "call";
63
64 case CEPH_OSD_OP_PGLS: return "pgls";
65 }
66 return "???";
67}
68
69const char *ceph_mds_state_name(int s)
70{
71 switch (s) {
72 /* down and out */
73 case CEPH_MDS_STATE_DNE: return "down:dne";
74 case CEPH_MDS_STATE_STOPPED: return "down:stopped";
75 /* up and out */
76 case CEPH_MDS_STATE_BOOT: return "up:boot";
77 case CEPH_MDS_STATE_STANDBY: return "up:standby";
78 case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay";
79 case CEPH_MDS_STATE_CREATING: return "up:creating";
80 case CEPH_MDS_STATE_STARTING: return "up:starting";
81 /* up and in */
82 case CEPH_MDS_STATE_REPLAY: return "up:replay";
83 case CEPH_MDS_STATE_RESOLVE: return "up:resolve";
84 case CEPH_MDS_STATE_RECONNECT: return "up:reconnect";
85 case CEPH_MDS_STATE_REJOIN: return "up:rejoin";
86 case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
87 case CEPH_MDS_STATE_ACTIVE: return "up:active";
88 case CEPH_MDS_STATE_STOPPING: return "up:stopping";
89 }
90 return "???";
91}
92
93const char *ceph_session_op_name(int op)
94{
95 switch (op) {
96 case CEPH_SESSION_REQUEST_OPEN: return "request_open";
97 case CEPH_SESSION_OPEN: return "open";
98 case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
99 case CEPH_SESSION_CLOSE: return "close";
100 case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
101 case CEPH_SESSION_RENEWCAPS: return "renewcaps";
102 case CEPH_SESSION_STALE: return "stale";
103 case CEPH_SESSION_RECALL_STATE: return "recall_state";
104 }
105 return "???";
106}
107
108const char *ceph_mds_op_name(int op)
109{
110 switch (op) {
111 case CEPH_MDS_OP_LOOKUP: return "lookup";
112 case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
113 case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
114 case CEPH_MDS_OP_GETATTR: return "getattr";
115 case CEPH_MDS_OP_SETXATTR: return "setxattr";
116 case CEPH_MDS_OP_SETATTR: return "setattr";
117 case CEPH_MDS_OP_RMXATTR: return "rmxattr";
118 case CEPH_MDS_OP_READDIR: return "readdir";
119 case CEPH_MDS_OP_MKNOD: return "mknod";
120 case CEPH_MDS_OP_LINK: return "link";
121 case CEPH_MDS_OP_UNLINK: return "unlink";
122 case CEPH_MDS_OP_RENAME: return "rename";
123 case CEPH_MDS_OP_MKDIR: return "mkdir";
124 case CEPH_MDS_OP_RMDIR: return "rmdir";
125 case CEPH_MDS_OP_SYMLINK: return "symlink";
126 case CEPH_MDS_OP_CREATE: return "create";
127 case CEPH_MDS_OP_OPEN: return "open";
128 case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
129 case CEPH_MDS_OP_LSSNAP: return "lssnap";
130 case CEPH_MDS_OP_MKSNAP: return "mksnap";
131 case CEPH_MDS_OP_RMSNAP: return "rmsnap";
132 }
133 return "???";
134}
135
136const char *ceph_cap_op_name(int op)
137{
138 switch (op) {
139 case CEPH_CAP_OP_GRANT: return "grant";
140 case CEPH_CAP_OP_REVOKE: return "revoke";
141 case CEPH_CAP_OP_TRUNC: return "trunc";
142 case CEPH_CAP_OP_EXPORT: return "export";
143 case CEPH_CAP_OP_IMPORT: return "import";
144 case CEPH_CAP_OP_UPDATE: return "update";
145 case CEPH_CAP_OP_DROP: return "drop";
146 case CEPH_CAP_OP_FLUSH: return "flush";
147 case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
148 case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
149 case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
150 case CEPH_CAP_OP_RELEASE: return "release";
151 case CEPH_CAP_OP_RENEW: return "renew";
152 }
153 return "???";
154}
155
156const char *ceph_lease_op_name(int o)
157{
158 switch (o) {
159 case CEPH_MDS_LEASE_REVOKE: return "revoke";
160 case CEPH_MDS_LEASE_RELEASE: return "release";
161 case CEPH_MDS_LEASE_RENEW: return "renew";
162 case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
163 }
164 return "???";
165}
166
167const char *ceph_snap_op_name(int o)
168{
169 switch (o) {
170 case CEPH_SNAP_OP_UPDATE: return "update";
171 case CEPH_SNAP_OP_CREATE: return "create";
172 case CEPH_SNAP_OP_DESTROY: return "destroy";
173 case CEPH_SNAP_OP_SPLIT: return "split";
174 }
175 return "???";
176}
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
new file mode 100644
index 000000000000..fabd302e5779
--- /dev/null
+++ b/fs/ceph/crush/crush.c
@@ -0,0 +1,151 @@
1
2#ifdef __KERNEL__
3# include <linux/slab.h>
4#else
5# include <stdlib.h>
6# include <assert.h>
7# define kfree(x) do { if (x) free(x); } while (0)
8# define BUG_ON(x) assert(!(x))
9#endif
10
11#include "crush.h"
12
13const char *crush_bucket_alg_name(int alg)
14{
15 switch (alg) {
16 case CRUSH_BUCKET_UNIFORM: return "uniform";
17 case CRUSH_BUCKET_LIST: return "list";
18 case CRUSH_BUCKET_TREE: return "tree";
19 case CRUSH_BUCKET_STRAW: return "straw";
20 default: return "unknown";
21 }
22}
23
24/**
25 * crush_get_bucket_item_weight - Get weight of an item in given bucket
26 * @b: bucket pointer
27 * @p: item index in bucket
28 */
29int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
30{
31 if (p >= b->size)
32 return 0;
33
34 switch (b->alg) {
35 case CRUSH_BUCKET_UNIFORM:
36 return ((struct crush_bucket_uniform *)b)->item_weight;
37 case CRUSH_BUCKET_LIST:
38 return ((struct crush_bucket_list *)b)->item_weights[p];
39 case CRUSH_BUCKET_TREE:
40 if (p & 1)
41 return ((struct crush_bucket_tree *)b)->node_weights[p];
42 return 0;
43 case CRUSH_BUCKET_STRAW:
44 return ((struct crush_bucket_straw *)b)->item_weights[p];
45 }
46 return 0;
47}
48
49/**
50 * crush_calc_parents - Calculate parent vectors for the given crush map.
51 * @map: crush_map pointer
52 */
53void crush_calc_parents(struct crush_map *map)
54{
55 int i, b, c;
56
57 for (b = 0; b < map->max_buckets; b++) {
58 if (map->buckets[b] == NULL)
59 continue;
60 for (i = 0; i < map->buckets[b]->size; i++) {
61 c = map->buckets[b]->items[i];
62 BUG_ON(c >= map->max_devices ||
63 c < -map->max_buckets);
64 if (c >= 0)
65 map->device_parents[c] = map->buckets[b]->id;
66 else
67 map->bucket_parents[-1-c] = map->buckets[b]->id;
68 }
69 }
70}
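/*
 * Worked example (illustrative): bucket ids are negative, so the
 * -1-c index maps them onto the array.  If bucket -2 is an item of
 * bucket -1, the loop sets bucket_parents[-1 - (-2)] =
 * bucket_parents[1] = -1; a device d listed in bucket -2 gets
 * device_parents[d] = -2.
 */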
71
72void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
73{
74 kfree(b->h.perm);
75 kfree(b->h.items);
76 kfree(b);
77}
78
79void crush_destroy_bucket_list(struct crush_bucket_list *b)
80{
81 kfree(b->item_weights);
82 kfree(b->sum_weights);
83 kfree(b->h.perm);
84 kfree(b->h.items);
85 kfree(b);
86}
87
88void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
89{
90 kfree(b->node_weights);
91 kfree(b);
92}
93
94void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
95{
96 kfree(b->straws);
97 kfree(b->item_weights);
98 kfree(b->h.perm);
99 kfree(b->h.items);
100 kfree(b);
101}
102
103void crush_destroy_bucket(struct crush_bucket *b)
104{
105 switch (b->alg) {
106 case CRUSH_BUCKET_UNIFORM:
107 crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
108 break;
109 case CRUSH_BUCKET_LIST:
110 crush_destroy_bucket_list((struct crush_bucket_list *)b);
111 break;
112 case CRUSH_BUCKET_TREE:
113 crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
114 break;
115 case CRUSH_BUCKET_STRAW:
116 crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
117 break;
118 }
119}
120
121/**
122 * crush_destroy - Destroy a crush_map
123 * @map: crush_map pointer
124 */
125void crush_destroy(struct crush_map *map)
126{
127 int b;
128
129 /* buckets */
130 if (map->buckets) {
131 for (b = 0; b < map->max_buckets; b++) {
132 if (map->buckets[b] == NULL)
133 continue;
134 crush_destroy_bucket(map->buckets[b]);
135 }
136 kfree(map->buckets);
137 }
138
139 /* rules */
140 if (map->rules) {
141 for (b = 0; b < map->max_rules; b++)
142 kfree(map->rules[b]);
143 kfree(map->rules);
144 }
145
146 kfree(map->bucket_parents);
147 kfree(map->device_parents);
148 kfree(map);
149}
150
151
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
new file mode 100644
index 000000000000..dcd7e7523700
--- /dev/null
+++ b/fs/ceph/crush/crush.h
@@ -0,0 +1,180 @@
1#ifndef _CRUSH_CRUSH_H
2#define _CRUSH_CRUSH_H
3
4#include <linux/types.h>
5
6/*
7 * CRUSH is a pseudo-random data distribution algorithm that
8 * efficiently distributes input values (typically, data objects)
9 * across a heterogeneous, structured storage cluster.
10 *
11 * The algorithm was originally described in detail in this paper
12 * (although the algorithm has evolved somewhat since then):
13 *
14 * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
15 *
16 * LGPL2
17 */
18
19
20#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
21
22
23#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
24#define CRUSH_MAX_SET 10 /* max size of a mapping result */
25
26
27/*
28 * CRUSH uses user-defined "rules" to describe how inputs should be
29 * mapped to devices. A rule consists of a sequence of steps to perform
30 * to generate the set of output devices.
31 */
32struct crush_rule_step {
33 __u32 op;
34 __s32 arg1;
35 __s32 arg2;
36};
37
38/* step op codes */
39enum {
40 CRUSH_RULE_NOOP = 0,
41 CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
42 CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
43 /* arg2 = type */
44 CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
45 CRUSH_RULE_EMIT = 4, /* no args */
46 CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
47 CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
48};
49
50/*
51 * for specifying choose num (arg1) relative to the max parameter
52 * passed to do_rule
53 */
54#define CRUSH_CHOOSE_N 0
55#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
56
57/*
58 * The rule mask is used to describe what the rule is intended for.
59 * Given a ruleset and size of output set, we search through the
60 * rule list for a matching rule_mask.
61 */
62struct crush_rule_mask {
63 __u8 ruleset;
64 __u8 type;
65 __u8 min_size;
66 __u8 max_size;
67};
68
69struct crush_rule {
70 __u32 len;
71 struct crush_rule_mask mask;
72 struct crush_rule_step steps[0];
73};
74
75#define crush_rule_size(len) (sizeof(struct crush_rule) + \
76 (len)*sizeof(struct crush_rule_step))
77
78
79
80/*
81 * A bucket is a named container of other items (either devices or
82 * other buckets). Items within a bucket are chosen using one of a
83 * few different algorithms. The table summarizes how the speed of
84 * each option measures up against mapping stability when items are
85 * added or removed.
86 *
87 * Bucket Alg Speed Additions Removals
88 * ------------------------------------------------
89 * uniform O(1) poor poor
90 * list O(n) optimal poor
91 * tree O(log n) good good
92 * straw O(n) optimal optimal
93 */
94enum {
95 CRUSH_BUCKET_UNIFORM = 1,
96 CRUSH_BUCKET_LIST = 2,
97 CRUSH_BUCKET_TREE = 3,
98 CRUSH_BUCKET_STRAW = 4
99};
100extern const char *crush_bucket_alg_name(int alg);
101
102struct crush_bucket {
103 __s32 id; /* this'll be negative */
104 __u16 type; /* non-zero; type=0 is reserved for devices */
105 __u8 alg; /* one of CRUSH_BUCKET_* */
106 __u8 hash; /* which hash function to use, CRUSH_HASH_* */
107 __u32 weight; /* 16-bit fixed point */
108 __u32 size; /* num items */
109 __s32 *items;
110
111 /*
112 * cached random permutation: used for uniform bucket and for
113 * the linear search fallback for the other bucket types.
114 */
115 __u32 perm_x; /* @x for which *perm is defined */
116 __u32 perm_n; /* num elements of *perm that are permuted/defined */
117 __u32 *perm;
118};
119
120struct crush_bucket_uniform {
121 struct crush_bucket h;
122 __u32 item_weight; /* 16-bit fixed point; all items equally weighted */
123};
124
125struct crush_bucket_list {
126 struct crush_bucket h;
127 __u32 *item_weights; /* 16-bit fixed point */
128 __u32 *sum_weights; /* 16-bit fixed point. element i is sum
129 of weights 0..i, inclusive */
130};
131
132struct crush_bucket_tree {
133 struct crush_bucket h; /* note: h.size is _tree_ size, not number of
134 actual items */
135 __u8 num_nodes;
136 __u32 *node_weights;
137};
138
139struct crush_bucket_straw {
140 struct crush_bucket h;
141 __u32 *item_weights; /* 16-bit fixed point */
142 __u32 *straws; /* 16-bit fixed point */
143};
144
145
146
147/*
148 * CRUSH map includes all buckets, rules, etc.
149 */
150struct crush_map {
151 struct crush_bucket **buckets;
152 struct crush_rule **rules;
153
154 /*
155 * Parent pointers to identify the parent bucket a device or
156 * bucket in the hierarchy. If an item appears more than
157 * once, this is the _last_ time it appeared (where buckets
158 * are processed in bucket id order, from -1 on down to
159 * -max_buckets).
160 */
161 __u32 *bucket_parents;
162 __u32 *device_parents;
163
164 __s32 max_buckets;
165 __u32 max_rules;
166 __s32 max_devices;
167};
168
169
170/* crush.c */
171extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
172extern void crush_calc_parents(struct crush_map *map);
173extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
174extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
175extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
176extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
177extern void crush_destroy_bucket(struct crush_bucket *b);
178extern void crush_destroy(struct crush_map *map);
179
180#endif
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c
new file mode 100644
index 000000000000..5873aed694bf
--- /dev/null
+++ b/fs/ceph/crush/hash.c
@@ -0,0 +1,149 @@
1
2#include <linux/types.h>
3#include "hash.h"
4
5/*
6 * Robert Jenkins' function for mixing 32-bit values
7 * http://burtleburtle.net/bob/hash/evahash.html
8 * a, b = random bits, c = input and output
9 */
10#define crush_hashmix(a, b, c) do { \
11 a = a-b; a = a-c; a = a^(c>>13); \
12 b = b-c; b = b-a; b = b^(a<<8); \
13 c = c-a; c = c-b; c = c^(b>>13); \
14 a = a-b; a = a-c; a = a^(c>>12); \
15 b = b-c; b = b-a; b = b^(a<<16); \
16 c = c-a; c = c-b; c = c^(b>>5); \
17 a = a-b; a = a-c; a = a^(c>>3); \
18 b = b-c; b = b-a; b = b^(a<<10); \
19 c = c-a; c = c-b; c = c^(b>>15); \
20 } while (0)
21
22#define crush_hash_seed 1315423911
23
24static __u32 crush_hash32_rjenkins1(__u32 a)
25{
26 __u32 hash = crush_hash_seed ^ a;
27 __u32 b = a;
28 __u32 x = 231232;
29 __u32 y = 1232;
30 crush_hashmix(b, x, hash);
31 crush_hashmix(y, a, hash);
32 return hash;
33}
34
35static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
36{
37 __u32 hash = crush_hash_seed ^ a ^ b;
38 __u32 x = 231232;
39 __u32 y = 1232;
40 crush_hashmix(a, b, hash);
41 crush_hashmix(x, a, hash);
42 crush_hashmix(b, y, hash);
43 return hash;
44}
45
46static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
47{
48 __u32 hash = crush_hash_seed ^ a ^ b ^ c;
49 __u32 x = 231232;
50 __u32 y = 1232;
51 crush_hashmix(a, b, hash);
52 crush_hashmix(c, x, hash);
53 crush_hashmix(y, a, hash);
54 crush_hashmix(b, x, hash);
55 crush_hashmix(y, c, hash);
56 return hash;
57}
58
59static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
60{
61 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
62 __u32 x = 231232;
63 __u32 y = 1232;
64 crush_hashmix(a, b, hash);
65 crush_hashmix(c, d, hash);
66 crush_hashmix(a, x, hash);
67 crush_hashmix(y, b, hash);
68 crush_hashmix(c, x, hash);
69 crush_hashmix(y, d, hash);
70 return hash;
71}
72
73static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
74 __u32 e)
75{
76 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
77 __u32 x = 231232;
78 __u32 y = 1232;
79 crush_hashmix(a, b, hash);
80 crush_hashmix(c, d, hash);
81 crush_hashmix(e, x, hash);
82 crush_hashmix(y, a, hash);
83 crush_hashmix(b, x, hash);
84 crush_hashmix(y, c, hash);
85 crush_hashmix(d, x, hash);
86 crush_hashmix(y, e, hash);
87 return hash;
88}
89
90
91__u32 crush_hash32(int type, __u32 a)
92{
93 switch (type) {
94 case CRUSH_HASH_RJENKINS1:
95 return crush_hash32_rjenkins1(a);
96 default:
97 return 0;
98 }
99}
100
101__u32 crush_hash32_2(int type, __u32 a, __u32 b)
102{
103 switch (type) {
104 case CRUSH_HASH_RJENKINS1:
105 return crush_hash32_rjenkins1_2(a, b);
106 default:
107 return 0;
108 }
109}
110
111__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
112{
113 switch (type) {
114 case CRUSH_HASH_RJENKINS1:
115 return crush_hash32_rjenkins1_3(a, b, c);
116 default:
117 return 0;
118 }
119}
120
121__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
122{
123 switch (type) {
124 case CRUSH_HASH_RJENKINS1:
125 return crush_hash32_rjenkins1_4(a, b, c, d);
126 default:
127 return 0;
128 }
129}
130
131__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
132{
133 switch (type) {
134 case CRUSH_HASH_RJENKINS1:
135 return crush_hash32_rjenkins1_5(a, b, c, d, e);
136 default:
137 return 0;
138 }
139}
140
141const char *crush_hash_name(int type)
142{
143 switch (type) {
144 case CRUSH_HASH_RJENKINS1:
145 return "rjenkins1";
146 default:
147 return "unknown";
148 }
149}
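
/*
 * Illustrative sketch (not part of this patch): CRUSH depends on these
 * hashes being pure functions, so any client hashing the same inputs
 * computes the same placement.  A hypothetical userspace check, built
 * against hash.c/hash.h from this patch:
 */
#include <stdio.h>
#include <linux/types.h>
#include "hash.h"

int main(void)
{
	__u32 x;

	/* identical output on every run and every host */
	for (x = 0; x < 4; x++)
		printf("x=%u draw=0x%04x\n", (unsigned)x,
		       crush_hash32_2(CRUSH_HASH_RJENKINS1, x, 123) & 0xffff);
	return 0;
}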
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
new file mode 100644
index 000000000000..ff48e110e4bb
--- /dev/null
+++ b/fs/ceph/crush/hash.h
@@ -0,0 +1,17 @@
1#ifndef _CRUSH_HASH_H
2#define _CRUSH_HASH_H
3
4#define CRUSH_HASH_RJENKINS1 0
5
6#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
7
8extern const char *crush_hash_name(int type);
9
10extern __u32 crush_hash32(int type, __u32 a);
11extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
12extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
13extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
14extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
15 __u32 e);
16
17#endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
new file mode 100644
index 000000000000..9ba54efb6543
--- /dev/null
+++ b/fs/ceph/crush/mapper.c
@@ -0,0 +1,596 @@
1
2#ifdef __KERNEL__
3# include <linux/string.h>
4# include <linux/slab.h>
5# include <linux/bug.h>
6# include <linux/kernel.h>
7# ifndef dprintk
8# define dprintk(args...)
9# endif
10#else
11# include <string.h>
12# include <stdio.h>
13# include <stdlib.h>
14# include <assert.h>
15# define BUG_ON(x) assert(!(x))
16# define dprintk(args...) /* printf(args) */
17# define kmalloc(x, f) malloc(x)
18# define kfree(x) free(x)
19#endif
20
21#include "crush.h"
22#include "hash.h"
23
24/*
25 * Implement the core CRUSH mapping algorithm.
26 */
27
28/**
29 * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
30 * @map: the crush_map
31 * @ruleset: the storage ruleset id (user defined)
32 * @type: storage ruleset type (user defined)
33 * @size: output set size
34 */
35int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
36{
37 int i;
38
39 for (i = 0; i < map->max_rules; i++) {
40 if (map->rules[i] &&
41 map->rules[i]->mask.ruleset == ruleset &&
42 map->rules[i]->mask.type == type &&
43 map->rules[i]->mask.min_size <= size &&
44 map->rules[i]->mask.max_size >= size)
45 return i;
46 }
47 return -1;
48}
49
50
51/*
52 * bucket choose methods
53 *
54 * For each bucket algorithm, we have a "choose" method that, given a
55 * crush input @x and replica position (usually, position in output set) @r,
56 * will produce an item in the bucket.
57 */
58
59/*
60 * Choose based on a random permutation of the bucket.
61 *
62 * We used to use some prime number arithmetic to do this, but it
63 * wasn't very random, and had some other bad behaviors. Instead, we
64 * calculate an actual random permutation of the bucket members.
65 * Since this is expensive, we optimize for the r=0 case, which
66 * captures the vast majority of calls.
67 */
68static int bucket_perm_choose(struct crush_bucket *bucket,
69 int x, int r)
70{
71 unsigned pr = r % bucket->size;
72 unsigned i, s;
73
74 /* start a new permutation if @x has changed */
75 if (bucket->perm_x != x || bucket->perm_n == 0) {
76 dprintk("bucket %d new x=%d\n", bucket->id, x);
77 bucket->perm_x = x;
78
79 /* optimize common r=0 case */
80 if (pr == 0) {
81 s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
82 bucket->size;
83 bucket->perm[0] = s;
84 bucket->perm_n = 0xffff; /* magic value, see below */
85 goto out;
86 }
87
88 for (i = 0; i < bucket->size; i++)
89 bucket->perm[i] = i;
90 bucket->perm_n = 0;
91 } else if (bucket->perm_n == 0xffff) {
92 /* clean up after the r=0 case above */
93 for (i = 1; i < bucket->size; i++)
94 bucket->perm[i] = i;
95 bucket->perm[bucket->perm[0]] = 0;
96 bucket->perm_n = 1;
97 }
98
99 /* calculate permutation up to pr */
100 for (i = 0; i < bucket->perm_n; i++)
101 dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
102 while (bucket->perm_n <= pr) {
103 unsigned p = bucket->perm_n;
104 /* no point in swapping the final entry */
105 if (p < bucket->size - 1) {
106 i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
107 (bucket->size - p);
108 if (i) {
109 unsigned t = bucket->perm[p + i];
110 bucket->perm[p + i] = bucket->perm[p];
111 bucket->perm[p] = t;
112 }
113 dprintk(" perm_choose swap %d with %d\n", p, p+i);
114 }
115 bucket->perm_n++;
116 }
117 for (i = 0; i < bucket->size; i++)
118 dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
119
120 s = bucket->perm[pr];
121out:
122 dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
123 bucket->size, x, r, pr, s);
124 return bucket->items[s];
125}
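
/*
 * Illustrative sketch (not part of this patch): the loop above is a
 * lazily evaluated Fisher-Yates shuffle -- each new position p swaps
 * in a hash-chosen element from the unshuffled tail.  A standalone
 * version with a stand-in hash (not CRUSH's):
 */
static void lazy_shuffle_example(unsigned *perm, unsigned size,
				 unsigned x, unsigned upto)
{
	unsigned p, i, t;

	for (p = 0; p < size; p++)
		perm[p] = p;			/* start from identity */
	for (p = 0; p <= upto && p + 1 < size; p++) {
		i = ((x * 2654435761u) ^ (p * 40503u)) % (size - p);
		t = perm[p + i];		/* pick from the tail... */
		perm[p + i] = perm[p];
		perm[p] = t;			/* ...and fix position p */
	}
}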
126
127/* uniform */
128static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
129 int x, int r)
130{
131 return bucket_perm_choose(&bucket->h, x, r);
132}
133
134/* list */
135static int bucket_list_choose(struct crush_bucket_list *bucket,
136 int x, int r)
137{
138 int i;
139
140 for (i = bucket->h.size-1; i >= 0; i--) {
141 __u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
142 r, bucket->h.id);
143 w &= 0xffff;
144 dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
145 "sw %x rand %llx",
146 i, x, r, bucket->h.items[i], bucket->item_weights[i],
147 bucket->sum_weights[i], w);
148 w *= bucket->sum_weights[i];
149 w = w >> 16;
150 /*dprintk(" scaled %llx\n", w);*/
151 if (w < bucket->item_weights[i])
152 return bucket->h.items[i];
153 }
154
155 BUG_ON(1);
156 return 0;
157}
158
159
160/* (binary) tree */
161static int height(int n)
162{
163 int h = 0;
164 while ((n & 1) == 0) {
165 h++;
166 n = n >> 1;
167 }
168 return h;
169}
170
171static int left(int x)
172{
173 int h = height(x);
174 return x - (1 << (h-1));
175}
176
177static int right(int x)
178{
179 int h = height(x);
180 return x + (1 << (h-1));
181}
182
183static int terminal(int x)
184{
185 return x & 1;
186}
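
/*
 * Illustrative sketch (not part of this patch): the tree lives in an
 * implicit 1-indexed array.  Leaves are the odd indices (item i sits
 * at node 2*i+1), height() counts trailing zero bits, and the root is
 * num_nodes >> 1.  For four items, num_nodes = 8:
 *
 *             4
 *           /   \
 *          2     6
 *         / \   / \
 *        1   3 5   7      leaf n holds item n >> 1
 *
 * A hypothetical dump for the userspace build of this file:
 */
static void dump_tree_layout_example(int num_nodes)
{
	int n;

	for (n = 1; n < num_nodes; n++) {
		if (terminal(n))
			printf("node %d: leaf, item %d\n", n, n >> 1);
		else
			printf("node %d: height %d, children %d and %d\n",
			       n, height(n), left(n), right(n));
	}
}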
187
188static int bucket_tree_choose(struct crush_bucket_tree *bucket,
189 int x, int r)
190{
191 int n, l;
192 __u32 w;
193 __u64 t;
194
195 /* start at root */
196 n = bucket->num_nodes >> 1;
197
198 while (!terminal(n)) {
199 /* pick point in [0, w) */
200 w = bucket->node_weights[n];
201 t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
202 bucket->h.id) * (__u64)w;
203 t = t >> 32;
204
205 /* descend to the left or right? */
206 l = left(n);
207 if (t < bucket->node_weights[l])
208 n = l;
209 else
210 n = right(n);
211 }
212
213 return bucket->h.items[n >> 1];
214}
215
216
217/* straw */
218
219static int bucket_straw_choose(struct crush_bucket_straw *bucket,
220 int x, int r)
221{
222 int i;
223 int high = 0;
224 __u64 high_draw = 0;
225 __u64 draw;
226
227 for (i = 0; i < bucket->h.size; i++) {
228 draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
229 draw &= 0xffff;
230 draw *= bucket->straws[i];
231 if (i == 0 || draw > high_draw) {
232 high = i;
233 high_draw = draw;
234 }
235 }
236 return bucket->h.items[high];
237}
238
239static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
240{
241 dprintk("choose %d x=%d r=%d\n", in->id, x, r);
242 switch (in->alg) {
243 case CRUSH_BUCKET_UNIFORM:
244 return bucket_uniform_choose((struct crush_bucket_uniform *)in,
245 x, r);
246 case CRUSH_BUCKET_LIST:
247 return bucket_list_choose((struct crush_bucket_list *)in,
248 x, r);
249 case CRUSH_BUCKET_TREE:
250 return bucket_tree_choose((struct crush_bucket_tree *)in,
251 x, r);
252 case CRUSH_BUCKET_STRAW:
253 return bucket_straw_choose((struct crush_bucket_straw *)in,
254 x, r);
255 default:
256 BUG_ON(1);
257 return in->items[0];
258 }
259}
260
261/*
262 * true if device is marked "out" of the cluster
263 * (failed, or fully offloaded)
264 */
265static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
266{
267 if (weight[item] >= 0x10000)
268 return 0;
269 if (weight[item] == 0)
270 return 1;
271 if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
272 < weight[item])
273 return 0;
274 return 1;
275}
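
/*
 * Illustrative sketch (not part of this patch): a weight strictly
 * between 0 and 0x10000 marks the device "out" for a matching
 * fraction of inputs, so reweighting migrates only that share of the
 * data.  Hypothetical check for the userspace build of this file:
 */
static void out_fraction_example(struct crush_map *map, __u32 *weight,
				 int item)
{
	int x, out = 0;

	for (x = 0; x < 10000; x++)
		out += is_out(map, weight, item, x);
	/* with weight[item] == 0x8000 (0.5), expect out ~= 5000 */
	printf("item %d out for %d of 10000 inputs\n", item, out);
}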
276
277/**
278 * crush_choose - choose numrep distinct items of given type
279 * @map: the crush_map
280 * @bucket: the bucket we are choosing an item from
281 * @x: crush input value
282 * @numrep: the number of items to choose
283 * @type: the type of item to choose
284 * @out: pointer to output vector
285 * @outpos: our position in that vector
286 * @firstn: true if choosing "first n" items, false if choosing "indep"
287 * @recurse_to_leaf: true if we want one device under each item of given type
288 * @out2: second output vector for leaf items (if @recurse_to_leaf)
289 */
290static int crush_choose(struct crush_map *map,
291 struct crush_bucket *bucket,
292 __u32 *weight,
293 int x, int numrep, int type,
294 int *out, int outpos,
295 int firstn, int recurse_to_leaf,
296 int *out2)
297{
298 int rep;
299 int ftotal, flocal;
300 int retry_descent, retry_bucket, skip_rep;
301 struct crush_bucket *in = bucket;
302 int r;
303 int i;
304 int item = 0;
305 int itemtype;
306 int collide, reject;
307 const int orig_tries = 5; /* attempts before we fall back to search */
308 dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos);
309
310 for (rep = outpos; rep < numrep; rep++) {
311 /* keep trying until we get a non-out, non-colliding item */
312 ftotal = 0;
313 skip_rep = 0;
314 do {
315 retry_descent = 0;
316 in = bucket; /* initial bucket */
317
318 /* choose through intervening buckets */
319 flocal = 0;
320 do {
321 collide = 0;
322 retry_bucket = 0;
323 r = rep;
324 if (in->alg == CRUSH_BUCKET_UNIFORM) {
325 /* be careful */
326 if (firstn || numrep >= in->size)
327 /* r' = r + f_total */
328 r += ftotal;
329 else if (in->size % numrep == 0)
330 /* r'=r+(n+1)*f_local */
331 r += (numrep+1) *
332 (flocal+ftotal);
333 else
334 /* r' = r + n*f_local */
335 r += numrep * (flocal+ftotal);
336 } else {
337 if (firstn)
338 /* r' = r + f_total */
339 r += ftotal;
340 else
341 /* r' = r + n*f_local */
342 r += numrep * (flocal+ftotal);
343 }
344
345 /* bucket choose */
346 if (in->size == 0) {
347 reject = 1;
348 goto reject;
349 }
350 if (flocal >= (in->size>>1) &&
351 flocal > orig_tries)
352 item = bucket_perm_choose(in, x, r);
353 else
354 item = crush_bucket_choose(in, x, r);
355 BUG_ON(item >= map->max_devices);
356
357 /* desired type? */
358 if (item < 0)
359 itemtype = map->buckets[-1-item]->type;
360 else
361 itemtype = 0;
362 dprintk(" item %d type %d\n", item, itemtype);
363
364 /* keep going? */
365 if (itemtype != type) {
366 BUG_ON(item >= 0 ||
367 (-1-item) >= map->max_buckets);
368 in = map->buckets[-1-item];
369 continue;
370 }
371
372 /* collision? */
373 for (i = 0; i < outpos; i++) {
374 if (out[i] == item) {
375 collide = 1;
376 break;
377 }
378 }
379
380 if (recurse_to_leaf &&
381 item < 0 &&
382 crush_choose(map, map->buckets[-1-item],
383 weight,
384 x, outpos+1, 0,
385 out2, outpos,
386 firstn, 0, NULL) <= outpos) {
387 reject = 1;
388 } else {
389 /* out? */
390 if (itemtype == 0)
391 reject = is_out(map, weight,
392 item, x);
393 else
394 reject = 0;
395 }
396
397reject:
398 if (reject || collide) {
399 ftotal++;
400 flocal++;
401
402 if (collide && flocal < 3)
403 /* retry locally a few times */
404 retry_bucket = 1;
405 else if (flocal < in->size + orig_tries)
406 /* exhaustive bucket search */
407 retry_bucket = 1;
408 else if (ftotal < 20)
409 /* then retry descent */
410 retry_descent = 1;
411 else
412 /* else give up */
413 skip_rep = 1;
414 dprintk(" reject %d collide %d "
415 "ftotal %d flocal %d\n",
416 reject, collide, ftotal,
417 flocal);
418 }
419 } while (retry_bucket);
420 } while (retry_descent);
421
422 if (skip_rep) {
423 dprintk("skip rep\n");
424 continue;
425 }
426
427 dprintk("choose got %d\n", item);
428 out[outpos] = item;
429 outpos++;
430 }
431
432 dprintk("choose returns %d\n", outpos);
433 return outpos;
434}
435
436
437/**
438 * crush_do_rule - calculate a mapping with the given input and rule
439 * @map: the crush_map
440 * @ruleno: the rule id
441 * @x: hash input
442 * @result: pointer to result vector
443 * @result_max: maximum result size
444 * @force: force initial replica choice; -1 for none
445 */
446int crush_do_rule(struct crush_map *map,
447 int ruleno, int x, int *result, int result_max,
448 int force, __u32 *weight)
449{
450 int result_len;
451 int force_context[CRUSH_MAX_DEPTH];
452 int force_pos = -1;
453 int a[CRUSH_MAX_SET];
454 int b[CRUSH_MAX_SET];
455 int c[CRUSH_MAX_SET];
456 int recurse_to_leaf;
457 int *w;
458 int wsize = 0;
459 int *o;
460 int osize;
461 int *tmp;
462 struct crush_rule *rule;
463 int step;
464 int i, j;
465 int numrep;
466 int firstn;
467 int rc = -1;
468
469 BUG_ON(ruleno >= map->max_rules);
470
471 rule = map->rules[ruleno];
472 result_len = 0;
473 w = a;
474 o = b;
475
476 /*
477 * determine hierarchical context of force, if any. note
478 * that this may or may not correspond to the specific types
479 * referenced by the crush rule.
480 */
481 if (force >= 0) {
482 if (force >= map->max_devices ||
483 map->device_parents[force] == 0) {
484 /*dprintk("CRUSH: forcefed device dne\n");*/
485 rc = -1; /* force fed device dne */
486 goto out;
487 }
488 if (!is_out(map, weight, force, x)) {
489 while (1) {
490 force_context[++force_pos] = force;
491 if (force >= 0)
492 force = map->device_parents[force];
493 else
494 force = map->bucket_parents[-1-force];
495 if (force == 0)
496 break;
497 }
498 }
499 }
500
501 for (step = 0; step < rule->len; step++) {
502 firstn = 0;
503 switch (rule->steps[step].op) {
504 case CRUSH_RULE_TAKE:
505 w[0] = rule->steps[step].arg1;
506 if (force_pos >= 0) {
507 BUG_ON(force_context[force_pos] != w[0]);
508 force_pos--;
509 }
510 wsize = 1;
511 break;
512
513 case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
514 case CRUSH_RULE_CHOOSE_FIRSTN:
515 firstn = 1; /* fall through */
516 case CRUSH_RULE_CHOOSE_LEAF_INDEP:
517 case CRUSH_RULE_CHOOSE_INDEP:
518 BUG_ON(wsize == 0);
519
520 recurse_to_leaf =
521 rule->steps[step].op ==
522 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
523 rule->steps[step].op ==
524 CRUSH_RULE_CHOOSE_LEAF_INDEP;
525
526 /* reset output */
527 osize = 0;
528
529 for (i = 0; i < wsize; i++) {
530 /*
531 * see the CRUSH_CHOOSE_N, CRUSH_CHOOSE_N_MINUS macros.
532 * basically, numrep <= 0 means relative to
533 * the provided result_max
534 */
535 numrep = rule->steps[step].arg1;
536 if (numrep <= 0) {
537 numrep += result_max;
538 if (numrep <= 0)
539 continue;
540 }
541 j = 0;
542 if (osize == 0 && force_pos >= 0) {
543 /* skip any intermediate types */
544 while (force_pos &&
545 force_context[force_pos] < 0 &&
546 rule->steps[step].arg2 !=
547 map->buckets[-1 -
548 force_context[force_pos]]->type)
549 force_pos--;
550 o[osize] = force_context[force_pos];
551 if (recurse_to_leaf)
552 c[osize] = force_context[0];
553 j++;
554 force_pos--;
555 }
556 osize += crush_choose(map,
557 map->buckets[-1-w[i]],
558 weight,
559 x, numrep,
560 rule->steps[step].arg2,
561 o+osize, j,
562 firstn,
563 recurse_to_leaf, c+osize);
564 }
565
566 if (recurse_to_leaf)
567 /* copy final _leaf_ values to output set */
568 memcpy(o, c, osize*sizeof(*o));
569
570 /* swap o and w arrays */
571 tmp = o;
572 o = w;
573 w = tmp;
574 wsize = osize;
575 break;
576
577
578 case CRUSH_RULE_EMIT:
579 for (i = 0; i < wsize && result_len < result_max; i++) {
580 result[result_len] = w[i];
581 result_len++;
582 }
583 wsize = 0;
584 break;
585
586 default:
587 BUG_ON(1);
588 }
589 }
590 rc = result_len;
591
592out:
593 return rc;
594}
595
596
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
new file mode 100644
index 000000000000..98e90046fd9f
--- /dev/null
+++ b/fs/ceph/crush/mapper.h
@@ -0,0 +1,20 @@
1#ifndef _CRUSH_MAPPER_H
2#define _CRUSH_MAPPER_H
3
4/*
5 * CRUSH functions for finding rules and then mapping an input to an
6 * output set.
7 *
8 * LGPL2
9 */
10
11#include "crush.h"
12
13extern int crush_find_rule(struct crush_map *map, int pool, int type, int size);
14extern int crush_do_rule(struct crush_map *map,
15 int ruleno,
16 int x, int *result, int result_max,
17 int forcefeed, /* -1 for none */
18 __u32 *weights);
19
20#endif
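
/*
 * Illustrative usage sketch (not part of this patch), assuming a
 * populated map and a per-device weight vector: find the rule for a
 * (ruleset, type, size) triple, then map input x to up to three
 * devices.
 */
static int map_example(struct crush_map *map, __u32 *weights, int x)
{
	int result[3];
	int ruleno, n;

	ruleno = crush_find_rule(map, 0 /* ruleset */, 1 /* type */, 3);
	if (ruleno < 0)
		return -1;			/* no rule matched */
	n = crush_do_rule(map, ruleno, x, result, 3, -1 /* no force */,
			  weights);
	/* result[0..n-1] now holds the chosen device ids */
	return n;
}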
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
new file mode 100644
index 000000000000..291ac288e791
--- /dev/null
+++ b/fs/ceph/crypto.c
@@ -0,0 +1,408 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/scatterlist.h>
6#include <crypto/hash.h>
7
8#include "crypto.h"
9#include "decode.h"
10
11int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
12{
13 if (*p + sizeof(u16) + sizeof(key->created) +
14 sizeof(u16) + key->len > end)
15 return -ERANGE;
16 ceph_encode_16(p, key->type);
17 ceph_encode_copy(p, &key->created, sizeof(key->created));
18 ceph_encode_16(p, key->len);
19 ceph_encode_copy(p, key->key, key->len);
20 return 0;
21}
22
23int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
24{
25 ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
26 key->type = ceph_decode_16(p);
27 ceph_decode_copy(p, &key->created, sizeof(key->created));
28 key->len = ceph_decode_16(p);
29 ceph_decode_need(p, end, key->len, bad);
30 key->key = kmalloc(key->len, GFP_NOFS);
31 if (!key->key)
32 return -ENOMEM;
33 ceph_decode_copy(p, key->key, key->len);
34 return 0;
35
36bad:
37 dout("failed to decode crypto key\n");
38 return -EINVAL;
39}
40
41int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
42{
43 int inlen = strlen(inkey);
44 int blen = inlen * 3 / 4;
45 void *buf, *p;
46 int ret;
47
48 dout("crypto_key_unarmor %s\n", inkey);
49 buf = kmalloc(blen, GFP_NOFS);
50 if (!buf)
51 return -ENOMEM;
52 blen = ceph_unarmor(buf, inkey, inkey+inlen);
53 if (blen < 0) {
54 kfree(buf);
55 return blen;
56 }
57
58 p = buf;
59 ret = ceph_crypto_key_decode(key, &p, p + blen);
60 kfree(buf);
61 if (ret)
62 return ret;
63 dout("crypto_key_unarmor key %p type %d len %d\n", key,
64 key->type, key->len);
65 return 0;
66}
67
68
69
70#define AES_KEY_SIZE 16
71
72static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
73{
74 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
75}
76
77const u8 *aes_iv = "cephsageyudagreg";
78
79int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len,
80 const void *src, size_t src_len)
81{
82 struct scatterlist sg_in[2], sg_out[1];
83 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
84 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
85 int ret;
86 void *iv;
87 int ivsize;
88 size_t zero_padding = (0x10 - (src_len & 0x0f));
89 char pad[16];
90
91 if (IS_ERR(tfm))
92 return PTR_ERR(tfm);
93
94 memset(pad, zero_padding, zero_padding);
95
96 *dst_len = src_len + zero_padding;
97
98 crypto_blkcipher_setkey((void *)tfm, key, key_len);
99 sg_init_table(sg_in, 2);
100 sg_set_buf(&sg_in[0], src, src_len);
101 sg_set_buf(&sg_in[1], pad, zero_padding);
102 sg_init_table(sg_out, 1);
103 sg_set_buf(sg_out, dst, *dst_len);
104 iv = crypto_blkcipher_crt(tfm)->iv;
105 ivsize = crypto_blkcipher_ivsize(tfm);
106
107 memcpy(iv, aes_iv, ivsize);
108 /*
109 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
110 key, key_len, 1);
111 print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
112 src, src_len, 1);
113 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
114 pad, zero_padding, 1);
115 */
116 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
117 src_len + zero_padding);
118 crypto_free_blkcipher(tfm);
119 if (ret < 0)
120 pr_err("ceph_aes_crypt failed %d\n", ret);
121 /*
122 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
123 dst, *dst_len, 1);
124 */
125 return ret;
126}
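
/*
 * Illustrative sketch (not part of this patch): the padding above is
 * PKCS#7 style -- every pad byte holds the pad length, and an exact
 * multiple of the 16-byte block gains a full extra block, so the
 * decrypt side can always strip it by reading the last byte:
 */
static size_t padded_len_example(size_t src_len)
{
	size_t pad = 0x10 - (src_len & 0x0f);	/* always 1..16 */

	return src_len + pad;	/* 15 -> 16, 16 -> 32, 17 -> 32 */
}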
127
128int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len,
129 const void *src1, size_t src1_len,
130 const void *src2, size_t src2_len)
131{
132 struct scatterlist sg_in[3], sg_out[1];
133 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
134 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
135 int ret;
136 void *iv;
137 int ivsize;
138 size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
139 char pad[16];
140
141 if (IS_ERR(tfm))
142 return PTR_ERR(tfm);
143
144 memset(pad, zero_padding, zero_padding);
145
146 *dst_len = src1_len + src2_len + zero_padding;
147
148 crypto_blkcipher_setkey((void *)tfm, key, key_len);
149 sg_init_table(sg_in, 3);
150 sg_set_buf(&sg_in[0], src1, src1_len);
151 sg_set_buf(&sg_in[1], src2, src2_len);
152 sg_set_buf(&sg_in[2], pad, zero_padding);
153 sg_init_table(sg_out, 1);
154 sg_set_buf(sg_out, dst, *dst_len);
155 iv = crypto_blkcipher_crt(tfm)->iv;
156 ivsize = crypto_blkcipher_ivsize(tfm);
157
158 memcpy(iv, aes_iv, ivsize);
159 /*
160 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
161 key, key_len, 1);
162 print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
163 src1, src1_len, 1);
164 print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
165 src2, src2_len, 1);
166 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
167 pad, zero_padding, 1);
168 */
169 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
170 src1_len + src2_len + zero_padding);
171 crypto_free_blkcipher(tfm);
172 if (ret < 0)
173 pr_err("ceph_aes_crypt2 failed %d\n", ret);
174 /*
175 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
176 dst, *dst_len, 1);
177 */
178 return ret;
179}
180
181int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len,
182 const void *src, size_t src_len)
183{
184 struct scatterlist sg_in[1], sg_out[2];
185 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
186 struct blkcipher_desc desc = { .tfm = tfm };
187 char pad[16];
188 void *iv;
189 int ivsize;
190 int ret;
191 int last_byte;
192
193 if (IS_ERR(tfm))
194 return PTR_ERR(tfm);
195
196 crypto_blkcipher_setkey((void *)tfm, key, key_len);
197 sg_init_table(sg_in, 1);
198 sg_init_table(sg_out, 2);
199 sg_set_buf(sg_in, src, src_len);
200 sg_set_buf(&sg_out[0], dst, *dst_len);
201 sg_set_buf(&sg_out[1], pad, sizeof(pad));
202
203 iv = crypto_blkcipher_crt(tfm)->iv;
204 ivsize = crypto_blkcipher_ivsize(tfm);
205
206 memcpy(iv, aes_iv, ivsize);
207
208 /*
209 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
210 key, key_len, 1);
211 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
212 src, src_len, 1);
213 */
214
215 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
216 crypto_free_blkcipher(tfm);
217 if (ret < 0) {
218 pr_err("ceph_aes_decrypt failed %d\n", ret);
219 return ret;
220 }
221
222 if (src_len <= *dst_len)
223 last_byte = ((char *)dst)[src_len - 1];
224 else
225 last_byte = pad[src_len - *dst_len - 1];
226 if (last_byte <= 16 && src_len >= last_byte) {
227 *dst_len = src_len - last_byte;
228 } else {
229 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
230 last_byte, (int)src_len);
231 return -EPERM; /* bad padding */
232 }
233 /*
234 print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
235 dst, *dst_len, 1);
236 */
237 return 0;
238}
239
240int ceph_aes_decrypt2(const void *key, int key_len,
241 void *dst1, size_t *dst1_len,
242 void *dst2, size_t *dst2_len,
243 const void *src, size_t src_len)
244{
245 struct scatterlist sg_in[1], sg_out[3];
246 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
247 struct blkcipher_desc desc = { .tfm = tfm };
248 char pad[16];
249 void *iv;
250 int ivsize;
251 int ret;
252 int last_byte;
253
254 if (IS_ERR(tfm))
255 return PTR_ERR(tfm);
256
257 sg_init_table(sg_in, 1);
258 sg_set_buf(sg_in, src, src_len);
259 sg_init_table(sg_out, 3);
260 sg_set_buf(&sg_out[0], dst1, *dst1_len);
261 sg_set_buf(&sg_out[1], dst2, *dst2_len);
262 sg_set_buf(&sg_out[2], pad, sizeof(pad));
263
264 crypto_blkcipher_setkey((void *)tfm, key, key_len);
265 iv = crypto_blkcipher_crt(tfm)->iv;
266 ivsize = crypto_blkcipher_ivsize(tfm);
267
268 memcpy(iv, aes_iv, ivsize);
269
270 /*
271 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
272 key, key_len, 1);
273 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
274 src, src_len, 1);
275 */
276
277 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
278 crypto_free_blkcipher(tfm);
279 if (ret < 0) {
280 pr_err("ceph_aes_decrypt failed %d\n", ret);
281 return ret;
282 }
283
284 if (src_len <= *dst1_len)
285 last_byte = ((char *)dst1)[src_len - 1];
286 else if (src_len <= *dst1_len + *dst2_len)
287 last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
288 else
289 last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
290 if (last_byte <= 16 && src_len >= last_byte) {
291 src_len -= last_byte;
292 } else {
293 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
294 last_byte, (int)src_len);
295 return -EPERM; /* bad padding */
296 }
297
298 if (src_len < *dst1_len) {
299 *dst1_len = src_len;
300 *dst2_len = 0;
301 } else {
302 *dst2_len = src_len - *dst1_len;
303 }
304 /*
305 print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1,
306 dst1, *dst1_len, 1);
307 print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1,
308 dst2, *dst2_len, 1);
309 */
310
311 return 0;
312}
313
314
315int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
316 const void *src, size_t src_len)
317{
318 switch (secret->type) {
319 case CEPH_CRYPTO_NONE:
320 if (*dst_len < src_len)
321 return -ERANGE;
322 memcpy(dst, src, src_len);
323 *dst_len = src_len;
324 return 0;
325
326 case CEPH_CRYPTO_AES:
327 return ceph_aes_decrypt(secret->key, secret->len, dst,
328 dst_len, src, src_len);
329
330 default:
331 return -EINVAL;
332 }
333}
334
335int ceph_decrypt2(struct ceph_crypto_key *secret,
336 void *dst1, size_t *dst1_len,
337 void *dst2, size_t *dst2_len,
338 const void *src, size_t src_len)
339{
340 size_t t;
341
342 switch (secret->type) {
343 case CEPH_CRYPTO_NONE:
344 if (*dst1_len + *dst2_len < src_len)
345 return -ERANGE;
346 t = min(*dst1_len, src_len);
347 memcpy(dst1, src, t);
348 *dst1_len = t;
349 src += t;
350 src_len -= t;
351 if (src_len) {
352 t = min(*dst2_len, src_len);
353 memcpy(dst2, src, t);
354 *dst2_len = t;
355 }
356 return 0;
357
358 case CEPH_CRYPTO_AES:
359 return ceph_aes_decrypt2(secret->key, secret->len,
360 dst1, dst1_len, dst2, dst2_len,
361 src, src_len);
362
363 default:
364 return -EINVAL;
365 }
366}
367
368int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
369 const void *src, size_t src_len)
370{
371 switch (secret->type) {
372 case CEPH_CRYPTO_NONE:
373 if (*dst_len < src_len)
374 return -ERANGE;
375 memcpy(dst, src, src_len);
376 *dst_len = src_len;
377 return 0;
378
379 case CEPH_CRYPTO_AES:
380 return ceph_aes_encrypt(secret->key, secret->len, dst,
381 dst_len, src, src_len);
382
383 default:
384 return -EINVAL;
385 }
386}
387
388int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
389 const void *src1, size_t src1_len,
390 const void *src2, size_t src2_len)
391{
392 switch (secret->type) {
393 case CEPH_CRYPTO_NONE:
394 if (*dst_len < src1_len + src2_len)
395 return -ERANGE;
396 memcpy(dst, src1, src1_len);
397 memcpy(dst + src1_len, src2, src2_len);
398 *dst_len = src1_len + src2_len;
399 return 0;
400
401 case CEPH_CRYPTO_AES:
402 return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
403 src1, src1_len, src2, src2_len);
404
405 default:
406 return -EINVAL;
407 }
408}
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
new file mode 100644
index 000000000000..40b502e6bd89
--- /dev/null
+++ b/fs/ceph/crypto.h
@@ -0,0 +1,48 @@
1#ifndef _FS_CEPH_CRYPTO_H
2#define _FS_CEPH_CRYPTO_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * cryptographic secret
9 */
10struct ceph_crypto_key {
11 int type;
12 struct ceph_timespec created;
13 int len;
14 void *key;
15};
16
17static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
18{
19 kfree(key->key);
20}
21
22extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
23 void **p, void *end);
24extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
25 void **p, void *end);
26extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
27
28/* crypto.c */
29extern int ceph_decrypt(struct ceph_crypto_key *secret,
30 void *dst, size_t *dst_len,
31 const void *src, size_t src_len);
32extern int ceph_encrypt(struct ceph_crypto_key *secret,
33 void *dst, size_t *dst_len,
34 const void *src, size_t src_len);
35extern int ceph_decrypt2(struct ceph_crypto_key *secret,
36 void *dst1, size_t *dst1_len,
37 void *dst2, size_t *dst2_len,
38 const void *src, size_t src_len);
39extern int ceph_encrypt2(struct ceph_crypto_key *secret,
40 void *dst, size_t *dst_len,
41 const void *src1, size_t src1_len,
42 const void *src2, size_t src2_len);
43
44/* armor.c */
45extern int ceph_armor(char *dst, const void *src, const void *end);
46extern int ceph_unarmor(void *dst, const char *src, const char *end);
47
48#endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
new file mode 100644
index 000000000000..e159f1415110
--- /dev/null
+++ b/fs/ceph/debugfs.c
@@ -0,0 +1,483 @@
1#include "ceph_debug.h"
2
3#include <linux/device.h>
4#include <linux/module.h>
5#include <linux/ctype.h>
6#include <linux/debugfs.h>
7#include <linux/seq_file.h>
8
9#include "super.h"
10#include "mds_client.h"
11#include "mon_client.h"
12#include "auth.h"
13
14#ifdef CONFIG_DEBUG_FS
15
16/*
17 * Implement /sys/kernel/debug/ceph fun
18 *
19 * /sys/kernel/debug/ceph/client* - an instance of the ceph client
20 * .../osdmap - current osdmap
21 * .../mdsmap - current mdsmap
22 * .../monmap - current monmap
23 * .../osdc - active osd requests
24 * .../mdsc - active mds requests
25 * .../monc - mon client state
26 * .../dentry_lru - dump contents of dentry lru
27 * .../caps - expose cap (reservation) stats
28 * .../bdi - symlink to ../../bdi/something
29 */
30
31static struct dentry *ceph_debugfs_dir;
32
33static int monmap_show(struct seq_file *s, void *p)
34{
35 int i;
36 struct ceph_client *client = s->private;
37
38 if (client->monc.monmap == NULL)
39 return 0;
40
41 seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
42 for (i = 0; i < client->monc.monmap->num_mon; i++) {
43 struct ceph_entity_inst *inst =
44 &client->monc.monmap->mon_inst[i];
45
46 seq_printf(s, "\t%s%lld\t%s\n",
47 ENTITY_NAME(inst->name),
48 pr_addr(&inst->addr.in_addr));
49 }
50 return 0;
51}
52
53static int mdsmap_show(struct seq_file *s, void *p)
54{
55 int i;
56 struct ceph_client *client = s->private;
57
58 if (client->mdsc.mdsmap == NULL)
59 return 0;
60 seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch);
61 seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root);
62 seq_printf(s, "session_timeout %d\n",
63 client->mdsc.mdsmap->m_session_timeout);
64 seq_printf(s, "session_autoclose %d\n",
65 client->mdsc.mdsmap->m_session_autoclose);
66 for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) {
67 struct ceph_entity_addr *addr =
68 &client->mdsc.mdsmap->m_info[i].addr;
69 int state = client->mdsc.mdsmap->m_info[i].state;
70
71 seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr),
72 ceph_mds_state_name(state));
73 }
74 return 0;
75}
76
77static int osdmap_show(struct seq_file *s, void *p)
78{
79 int i;
80 struct ceph_client *client = s->private;
81 struct rb_node *n;
82
83 if (client->osdc.osdmap == NULL)
84 return 0;
85 seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
86 seq_printf(s, "flags%s%s\n",
87 (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
88 " NEARFULL" : "",
89 (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
90 " FULL" : "");
91 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
92 struct ceph_pg_pool_info *pool =
93 rb_entry(n, struct ceph_pg_pool_info, node);
94 seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
95 pool->id, pool->v.pg_num, pool->pg_num_mask,
96 pool->v.lpg_num, pool->lpg_num_mask);
97 }
98 for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
99 struct ceph_entity_addr *addr =
100 &client->osdc.osdmap->osd_addr[i];
101 int state = client->osdc.osdmap->osd_state[i];
102 char sb[64];
103
104 seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
105 i, pr_addr(&addr->in_addr),
106 ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
107 ceph_osdmap_state_str(sb, sizeof(sb), state));
108 }
109 return 0;
110}
111
112static int monc_show(struct seq_file *s, void *p)
113{
114 struct ceph_client *client = s->private;
115 struct ceph_mon_statfs_request *req;
116 struct ceph_mon_client *monc = &client->monc;
117 struct rb_node *rp;
118
119 mutex_lock(&monc->mutex);
120
121 if (monc->have_mdsmap)
122 seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
123 if (monc->have_osdmap)
124 seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
125 if (monc->want_next_osdmap)
126 seq_printf(s, "want next osdmap\n");
127
128 for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) {
129 req = rb_entry(rp, struct ceph_mon_statfs_request, node);
130 seq_printf(s, "%lld statfs\n", req->tid);
131 }
132
133 mutex_unlock(&monc->mutex);
134 return 0;
135}
136
137static int mdsc_show(struct seq_file *s, void *p)
138{
139 struct ceph_client *client = s->private;
140 struct ceph_mds_client *mdsc = &client->mdsc;
141 struct ceph_mds_request *req;
142 struct rb_node *rp;
143 int pathlen;
144 u64 pathbase;
145 char *path;
146
147 mutex_lock(&mdsc->mutex);
148 for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
149 req = rb_entry(rp, struct ceph_mds_request, r_node);
150
151 if (req->r_request)
152 seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds);
153 else
154 seq_printf(s, "%lld\t(no request)\t", req->r_tid);
155
156 seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
157
158 if (req->r_got_unsafe)
159 seq_printf(s, "\t(unsafe)");
160 else
161 seq_printf(s, "\t");
162
163 if (req->r_inode) {
164 seq_printf(s, " #%llx", ceph_ino(req->r_inode));
165 } else if (req->r_dentry) {
166 path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
167 &pathbase, 0);
168 spin_lock(&req->r_dentry->d_lock);
169 seq_printf(s, " #%llx/%.*s (%s)",
170 ceph_ino(req->r_dentry->d_parent->d_inode),
171 req->r_dentry->d_name.len,
172 req->r_dentry->d_name.name,
173 path ? path : "");
174 spin_unlock(&req->r_dentry->d_lock);
175 kfree(path);
176 } else if (req->r_path1) {
177 seq_printf(s, " #%llx/%s", req->r_ino1.ino,
178 req->r_path1);
179 }
180
181 if (req->r_old_dentry) {
182 path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
183 &pathbase, 0);
184 spin_lock(&req->r_old_dentry->d_lock);
185 seq_printf(s, " #%llx/%.*s (%s)",
186 ceph_ino(req->r_old_dentry->d_parent->d_inode),
187 req->r_old_dentry->d_name.len,
188 req->r_old_dentry->d_name.name,
189 path ? path : "");
190 spin_unlock(&req->r_old_dentry->d_lock);
191 kfree(path);
192 } else if (req->r_path2) {
193 if (req->r_ino2.ino)
194 seq_printf(s, " #%llx/%s", req->r_ino2.ino,
195 req->r_path2);
196 else
197 seq_printf(s, " %s", req->r_path2);
198 }
199
200 seq_printf(s, "\n");
201 }
202 mutex_unlock(&mdsc->mutex);
203
204 return 0;
205}
206
207static int osdc_show(struct seq_file *s, void *pp)
208{
209 struct ceph_client *client = s->private;
210 struct ceph_osd_client *osdc = &client->osdc;
211 struct rb_node *p;
212
213 mutex_lock(&osdc->request_mutex);
214 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
215 struct ceph_osd_request *req;
216 struct ceph_osd_request_head *head;
217 struct ceph_osd_op *op;
218 int num_ops;
219 int opcode, olen;
220 int i;
221
222 req = rb_entry(p, struct ceph_osd_request, r_node);
223
224 seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
225 req->r_osd ? req->r_osd->o_osd : -1,
226 le32_to_cpu(req->r_pgid.pool),
227 le16_to_cpu(req->r_pgid.ps));
228
229 head = req->r_request->front.iov_base;
230 op = (void *)(head + 1);
231
232 num_ops = le16_to_cpu(head->num_ops);
233 olen = le32_to_cpu(head->object_len);
234 seq_printf(s, "%.*s", olen,
235 (const char *)(head->ops + num_ops));
236
237 if (req->r_reassert_version.epoch)
238 seq_printf(s, "\t%u'%llu",
239 (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
240 le64_to_cpu(req->r_reassert_version.version));
241 else
242 seq_printf(s, "\t");
243
244 for (i = 0; i < num_ops; i++) {
245 opcode = le16_to_cpu(op->op);
246 seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
247 op++;
248 }
249
250 seq_printf(s, "\n");
251 }
252 mutex_unlock(&osdc->request_mutex);
253 return 0;
254}
255
256static int caps_show(struct seq_file *s, void *p)
257{
258 struct ceph_client *client = s->private;
259 int total, avail, used, reserved, min;
260
261 ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
262 seq_printf(s, "total\t\t%d\n"
263 "avail\t\t%d\n"
264 "used\t\t%d\n"
265 "reserved\t%d\n"
266 "min\t%d\n",
267 total, avail, used, reserved, min);
268 return 0;
269}
270
271static int dentry_lru_show(struct seq_file *s, void *ptr)
272{
273 struct ceph_client *client = s->private;
274 struct ceph_mds_client *mdsc = &client->mdsc;
275 struct ceph_dentry_info *di;
276
277 spin_lock(&mdsc->dentry_lru_lock);
278 list_for_each_entry(di, &mdsc->dentry_lru, lru) {
279 struct dentry *dentry = di->dentry;
280 seq_printf(s, "%p %p\t%.*s\n",
281 di, dentry, dentry->d_name.len, dentry->d_name.name);
282 }
283 spin_unlock(&mdsc->dentry_lru_lock);
284
285 return 0;
286}
287
288#define DEFINE_SHOW_FUNC(name) \
289static int name##_open(struct inode *inode, struct file *file) \
290{ \
291 struct seq_file *sf; \
292 int ret; \
293 \
294 ret = single_open(file, name, NULL); \
295 sf = file->private_data; \
296 sf->private = inode->i_private; \
297 return ret; \
298} \
299 \
300static const struct file_operations name##_fops = { \
301 .open = name##_open, \
302 .read = seq_read, \
303 .llseek = seq_lseek, \
304 .release = single_release, \
305};
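
/*
 * Illustrative note (not part of this patch): for example,
 * DEFINE_SHOW_FUNC(monmap_show) below expands to
 *
 *	static int monmap_show_open(struct inode *inode, struct file *file)
 *	{
 *		struct seq_file *sf;
 *		int ret;
 *
 *		ret = single_open(file, monmap_show, NULL);
 *		sf = file->private_data;
 *		sf->private = inode->i_private;
 *		return ret;
 *	}
 *
 * plus a monmap_show_fops wired to seq_read/seq_lseek/single_release,
 * which is why each show routine reads its client from s->private.
 */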
306
307DEFINE_SHOW_FUNC(monmap_show)
308DEFINE_SHOW_FUNC(mdsmap_show)
309DEFINE_SHOW_FUNC(osdmap_show)
310DEFINE_SHOW_FUNC(monc_show)
311DEFINE_SHOW_FUNC(mdsc_show)
312DEFINE_SHOW_FUNC(osdc_show)
313DEFINE_SHOW_FUNC(dentry_lru_show)
314DEFINE_SHOW_FUNC(caps_show)
315
316static int congestion_kb_set(void *data, u64 val)
317{
318 struct ceph_client *client = (struct ceph_client *)data;
319
320 if (client)
321 client->mount_args->congestion_kb = (int)val;
322
323 return 0;
324}
325
326static int congestion_kb_get(void *data, u64 *val)
327{
328 struct ceph_client *client = (struct ceph_client *)data;
329
330 if (client)
331 *val = (u64)client->mount_args->congestion_kb;
332
333 return 0;
334}
335
336
337DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
338 congestion_kb_set, "%llu\n");
339
340int __init ceph_debugfs_init(void)
341{
342 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
343 if (!ceph_debugfs_dir)
344 return -ENOMEM;
345 return 0;
346}
347
348void ceph_debugfs_cleanup(void)
349{
350 debugfs_remove(ceph_debugfs_dir);
351}
352
353int ceph_debugfs_client_init(struct ceph_client *client)
354{
355 int ret = 0;
356 char name[80];
357
358 snprintf(name, sizeof(name), FSID_FORMAT ".client%lld",
359 PR_FSID(&client->fsid), client->monc.auth->global_id);
360
361 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
362 if (!client->debugfs_dir)
363 goto out;
364
365 client->monc.debugfs_file = debugfs_create_file("monc",
366 0600,
367 client->debugfs_dir,
368 client,
369 &monc_show_fops);
370 if (!client->monc.debugfs_file)
371 goto out;
372
373 client->mdsc.debugfs_file = debugfs_create_file("mdsc",
374 0600,
375 client->debugfs_dir,
376 client,
377 &mdsc_show_fops);
378 if (!client->mdsc.debugfs_file)
379 goto out;
380
381 client->osdc.debugfs_file = debugfs_create_file("osdc",
382 0600,
383 client->debugfs_dir,
384 client,
385 &osdc_show_fops);
386 if (!client->osdc.debugfs_file)
387 goto out;
388
389 client->debugfs_monmap = debugfs_create_file("monmap",
390 0600,
391 client->debugfs_dir,
392 client,
393 &monmap_show_fops);
394 if (!client->debugfs_monmap)
395 goto out;
396
397 client->debugfs_mdsmap = debugfs_create_file("mdsmap",
398 0600,
399 client->debugfs_dir,
400 client,
401 &mdsmap_show_fops);
402 if (!client->debugfs_mdsmap)
403 goto out;
404
405 client->debugfs_osdmap = debugfs_create_file("osdmap",
406 0600,
407 client->debugfs_dir,
408 client,
409 &osdmap_show_fops);
410 if (!client->debugfs_osdmap)
411 goto out;
412
413 client->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
414 0600,
415 client->debugfs_dir,
416 client,
417 &dentry_lru_show_fops);
418 if (!client->debugfs_dentry_lru)
419 goto out;
420
421 client->debugfs_caps = debugfs_create_file("caps",
422 0400,
423 client->debugfs_dir,
424 client,
425 &caps_show_fops);
426 if (!client->debugfs_caps)
427 goto out;
428
429 client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb",
430 0600,
431 client->debugfs_dir,
432 client,
433 &congestion_kb_fops);
434 if (!client->debugfs_congestion_kb)
435 goto out;
436
437 sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
438 client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
439 name);
440
441 return 0;
442
443out:
444 ceph_debugfs_client_cleanup(client);
445 return ret;
446}
447
448void ceph_debugfs_client_cleanup(struct ceph_client *client)
449{
450 debugfs_remove(client->debugfs_bdi);
451 debugfs_remove(client->debugfs_caps);
452 debugfs_remove(client->debugfs_dentry_lru);
453 debugfs_remove(client->debugfs_osdmap);
454 debugfs_remove(client->debugfs_mdsmap);
455 debugfs_remove(client->debugfs_monmap);
456 debugfs_remove(client->osdc.debugfs_file);
457 debugfs_remove(client->mdsc.debugfs_file);
458 debugfs_remove(client->monc.debugfs_file);
459 debugfs_remove(client->debugfs_congestion_kb);
460 debugfs_remove(client->debugfs_dir);
461}
462
463#else // CONFIG_DEBUG_FS
464
465int __init ceph_debugfs_init(void)
466{
467 return 0;
468}
469
470void ceph_debugfs_cleanup(void)
471{
472}
473
474int ceph_debugfs_client_init(struct ceph_client *client)
475{
476 return 0;
477}
478
479void ceph_debugfs_client_cleanup(struct ceph_client *client)
480{
481}
482
483#endif // CONFIG_DEBUG_FS
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
new file mode 100644
index 000000000000..65b3e022eaf5
--- /dev/null
+++ b/fs/ceph/decode.h
@@ -0,0 +1,194 @@
1#ifndef __CEPH_DECODE_H
2#define __CEPH_DECODE_H
3
4#include <asm/unaligned.h>
5#include <linux/time.h>
6
7#include "types.h"
8
9/*
10 * in all cases,
11 * void **p pointer to position pointer
12 * void *end pointer to end of buffer (last byte + 1)
13 */
14
15static inline u64 ceph_decode_64(void **p)
16{
17 u64 v = get_unaligned_le64(*p);
18 *p += sizeof(u64);
19 return v;
20}
21static inline u32 ceph_decode_32(void **p)
22{
23 u32 v = get_unaligned_le32(*p);
24 *p += sizeof(u32);
25 return v;
26}
27static inline u16 ceph_decode_16(void **p)
28{
29 u16 v = get_unaligned_le16(*p);
30 *p += sizeof(u16);
31 return v;
32}
33static inline u8 ceph_decode_8(void **p)
34{
35 u8 v = *(u8 *)*p;
36 (*p)++;
37 return v;
38}
39static inline void ceph_decode_copy(void **p, void *pv, size_t n)
40{
41 memcpy(pv, *p, n);
42 *p += n;
43}
44
45/*
46 * bounds check input.
47 */
48#define ceph_decode_need(p, end, n, bad) \
49 do { \
50 if (unlikely(*(p) + (n) > (end))) \
51 goto bad; \
52 } while (0)
53
54#define ceph_decode_64_safe(p, end, v, bad) \
55 do { \
56 ceph_decode_need(p, end, sizeof(u64), bad); \
57 v = ceph_decode_64(p); \
58 } while (0)
59#define ceph_decode_32_safe(p, end, v, bad) \
60 do { \
61 ceph_decode_need(p, end, sizeof(u32), bad); \
62 v = ceph_decode_32(p); \
63 } while (0)
64#define ceph_decode_16_safe(p, end, v, bad) \
65 do { \
66 ceph_decode_need(p, end, sizeof(u16), bad); \
67 v = ceph_decode_16(p); \
68 } while (0)
69#define ceph_decode_8_safe(p, end, v, bad) \
70 do { \
71 ceph_decode_need(p, end, sizeof(u8), bad); \
72 v = ceph_decode_8(p); \
73 } while (0)
74
75#define ceph_decode_copy_safe(p, end, pv, n, bad) \
76 do { \
77 ceph_decode_need(p, end, n, bad); \
78 ceph_decode_copy(p, pv, n); \
79 } while (0)
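
/*
 * Illustrative sketch (not part of this patch): a typical decoder
 * walks *p forward with the _safe variants and bails out to a 'bad'
 * label on truncated input.  The record type and -ERANGE use here are
 * hypothetical (assumes <linux/errno.h>):
 */
struct example_record {
	u32 id;
	u64 size;
};

static inline int example_decode(void **p, void *end,
				 struct example_record *rec)
{
	ceph_decode_32_safe(p, end, rec->id, bad);
	ceph_decode_64_safe(p, end, rec->size, bad);
	return 0;
bad:
	return -ERANGE;		/* input ended before 12 bytes */
}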
80
81/*
82 * struct ceph_timespec <-> struct timespec
83 */
84static inline void ceph_decode_timespec(struct timespec *ts,
85 const struct ceph_timespec *tv)
86{
87 ts->tv_sec = le32_to_cpu(tv->tv_sec);
88 ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
89}
90static inline void ceph_encode_timespec(struct ceph_timespec *tv,
91 const struct timespec *ts)
92{
93 tv->tv_sec = cpu_to_le32(ts->tv_sec);
94 tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
95}
96
97/*
98 * sockaddr_storage <-> ceph_sockaddr
99 */
100static inline void ceph_encode_addr(struct ceph_entity_addr *a)
101{
102 a->in_addr.ss_family = htons(a->in_addr.ss_family);
103}
104static inline void ceph_decode_addr(struct ceph_entity_addr *a)
105{
106 a->in_addr.ss_family = ntohs(a->in_addr.ss_family);
107 WARN_ON(a->in_addr.ss_family == 512); /* 512 == AF_INET byte-swapped: decoded twice? */
108}
109
110/*
111 * encoders
112 */
113static inline void ceph_encode_64(void **p, u64 v)
114{
115 put_unaligned_le64(v, (__le64 *)*p);
116 *p += sizeof(u64);
117}
118static inline void ceph_encode_32(void **p, u32 v)
119{
120 put_unaligned_le32(v, (__le32 *)*p);
121 *p += sizeof(u32);
122}
123static inline void ceph_encode_16(void **p, u16 v)
124{
125 put_unaligned_le16(v, (__le16 *)*p);
126 *p += sizeof(u16);
127}
128static inline void ceph_encode_8(void **p, u8 v)
129{
130 *(u8 *)*p = v;
131 (*p)++;
132}
133static inline void ceph_encode_copy(void **p, const void *s, int len)
134{
135 memcpy(*p, s, len);
136 *p += len;
137}
138
139/*
140 * filepath, string encoders
141 */
142static inline void ceph_encode_filepath(void **p, void *end,
143 u64 ino, const char *path)
144{
145 u32 len = path ? strlen(path) : 0;
146 BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
147 ceph_encode_8(p, 1);
148 ceph_encode_64(p, ino);
149 ceph_encode_32(p, len);
150 if (len)
151 memcpy(*p, path, len);
152 *p += len;
153}
154
155static inline void ceph_encode_string(void **p, void *end,
156 const char *s, u32 len)
157{
158 BUG_ON(*p + sizeof(len) + len > end);
159 ceph_encode_32(p, len);
160 if (len)
161 memcpy(*p, s, len);
162 *p += len;
163}
164
165#define ceph_encode_need(p, end, n, bad) \
166 do { \
167 if (unlikely(*(p) + (n) > (end))) \
168 goto bad; \
169 } while (0)
170
171#define ceph_encode_64_safe(p, end, v, bad) \
172 do { \
173 ceph_encode_need(p, end, sizeof(u64), bad); \
174 ceph_encode_64(p, v); \
175 } while (0)
176#define ceph_encode_32_safe(p, end, v, bad) \
177 do { \
178 ceph_encode_need(p, end, sizeof(u32), bad); \
179 ceph_encode_32(p, v); \
180 } while (0)
181#define ceph_encode_16_safe(p, end, v, bad) \
182 do { \
183 ceph_encode_need(p, end, sizeof(u16), bad); \
184 ceph_encode_16(p, v); \
185 } while (0)
186
187#define ceph_encode_copy_safe(p, end, pv, n, bad) \
188 do { \
189 ceph_encode_need(p, end, n, bad); \
190 ceph_encode_copy(p, pv, n); \
191 } while (0)
192
193
194#endif
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
new file mode 100644
index 000000000000..5107384ee029
--- /dev/null
+++ b/fs/ceph/dir.c
@@ -0,0 +1,1220 @@
1#include "ceph_debug.h"
2
3#include <linux/spinlock.h>
4#include <linux/fs_struct.h>
5#include <linux/namei.h>
6#include <linux/sched.h>
7
8#include "super.h"
9
10/*
11 * Directory operations: readdir, lookup, create, link, unlink,
12 * rename, etc.
13 */
14
15/*
16 * Ceph MDS operations are specified in terms of a base ino and
17 * relative path. Thus, the client can specify an operation on a
18 * specific inode (e.g., a getattr due to fstat(2)), or as a path
19 * relative to, say, the root directory.
20 *
21 * Normally, we limit ourselves to strict inode ops (no path component)
22 * or dentry operations (a single path component relative to an ino). The
23 * exception to this is open_root_dentry(), which will open the mount
24 * point by name.
25 */
26
27const struct inode_operations ceph_dir_iops;
28const struct file_operations ceph_dir_fops;
29struct dentry_operations ceph_dentry_ops;
30
31/*
32 * Initialize ceph dentry state.
33 */
34int ceph_init_dentry(struct dentry *dentry)
35{
36 struct ceph_dentry_info *di;
37
38 if (dentry->d_fsdata)
39 return 0;
40
41 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
42 dentry->d_op = &ceph_dentry_ops;
43 else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
44 dentry->d_op = &ceph_snapdir_dentry_ops;
45 else
46 dentry->d_op = &ceph_snap_dentry_ops;
47
48 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS);
49 if (!di)
50 return -ENOMEM; /* oh well */
51
52 spin_lock(&dentry->d_lock);
53 if (dentry->d_fsdata) /* lost a race */
54 goto out_unlock;
55 di->dentry = dentry;
56 di->lease_session = NULL;
57 dentry->d_fsdata = di;
58 dentry->d_time = jiffies;
59 ceph_dentry_lru_add(dentry);
60out_unlock:
61 spin_unlock(&dentry->d_lock);
62 return 0;
63}
64
65
66
67/*
68 * for readdir, we encode the directory frag and offset within that
69 * frag into f_pos.
70 */
71static unsigned fpos_frag(loff_t p)
72{
73 return p >> 32;
74}
75static unsigned fpos_off(loff_t p)
76{
77 return p & 0xffffffff;
78}
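
/*
 * Illustrative sketch (not part of this patch): the matching pack
 * operation -- ceph_make_fpos(), used below and defined elsewhere in
 * this patch -- amounts to placing the frag in the high 32 bits:
 */
static inline loff_t make_fpos_example(unsigned frag, unsigned off)
{
	return ((loff_t)frag << 32) | (loff_t)off;
}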
79
80/*
81 * When possible, we try to satisfy a readdir by peeking at the
82 * dcache. We make this work by carefully ordering dentries on
83 * d_u.d_child when we initially get results back from the MDS, and
84 * falling back to a "normal" sync readdir if any dentries in the dir
85 * are dropped.
86 *
87 * I_COMPLETE indicates we have all dentries in the dir. It is
88 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
89 * the MDS if/when the directory is modified).
90 */
91static int __dcache_readdir(struct file *filp,
92 void *dirent, filldir_t filldir)
93{
94 struct inode *inode = filp->f_dentry->d_inode;
95 struct ceph_file_info *fi = filp->private_data;
96 struct dentry *parent = filp->f_dentry;
97 struct inode *dir = parent->d_inode;
98 struct list_head *p;
99 struct dentry *dentry, *last;
100 struct ceph_dentry_info *di;
101 int err = 0;
102
103 /* claim ref on last dentry we returned */
104 last = fi->dentry;
105 fi->dentry = NULL;
106
107 dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
108 last);
109
110 spin_lock(&dcache_lock);
111
112 /* start at beginning? */
113 if (filp->f_pos == 2 || (last &&
114 filp->f_pos < ceph_dentry(last)->offset)) {
115 if (list_empty(&parent->d_subdirs))
116 goto out_unlock;
117 p = parent->d_subdirs.prev;
118 dout(" initial p %p/%p\n", p->prev, p->next);
119 } else {
120 p = last->d_u.d_child.prev;
121 }
122
123more:
124 dentry = list_entry(p, struct dentry, d_u.d_child);
125 di = ceph_dentry(dentry);
126 while (1) {
127 dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next,
128 parent->d_subdirs.prev, parent->d_subdirs.next);
129 if (p == &parent->d_subdirs) {
130 fi->at_end = 1;
131 goto out_unlock;
132 }
133 if (!d_unhashed(dentry) && dentry->d_inode &&
134 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
135 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
136 filp->f_pos <= di->offset)
137 break;
138 dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
139 dentry->d_name.len, dentry->d_name.name, di->offset,
140 filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
141 !dentry->d_inode ? " null" : "");
142 p = p->prev;
143 dentry = list_entry(p, struct dentry, d_u.d_child);
144 di = ceph_dentry(dentry);
145 }
146
147 atomic_inc(&dentry->d_count);
148 spin_unlock(&dcache_lock);
149 spin_unlock(&inode->i_lock);
150
151 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
152 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
153 filp->f_pos = di->offset;
154 err = filldir(dirent, dentry->d_name.name,
155 dentry->d_name.len, di->offset,
156 dentry->d_inode->i_ino,
157 dentry->d_inode->i_mode >> 12);
158
159 if (last) {
160 if (err < 0) {
161 /* remember our position */
162 fi->dentry = last;
163 fi->next_offset = di->offset;
164 } else {
165 dput(last);
166 }
167 last = NULL;
168 }
169
170 spin_lock(&inode->i_lock);
171 spin_lock(&dcache_lock);
172
173 if (err < 0)
174 goto out_unlock;
175
176 last = dentry;
177
178 p = p->prev;
179 filp->f_pos++;
180
181 /* make sure a dentry wasn't dropped while we didn't have dcache_lock */
182 if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
183 goto more;
184 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
185 err = -EAGAIN;
186
187out_unlock:
188 spin_unlock(&dcache_lock);
189
190 if (last) {
191 spin_unlock(&inode->i_lock);
192 dput(last);
193 spin_lock(&inode->i_lock);
194 }
195
196 return err;
197}
198
199/*
200 * make note of the last dentry we read, so we can
201 * continue at the same lexicographical point,
202 * regardless of what dir changes take place on the
203 * server.
204 */
205static int note_last_dentry(struct ceph_file_info *fi, const char *name,
206 int len)
207{
208 kfree(fi->last_name);
209 fi->last_name = kmalloc(len+1, GFP_NOFS);
210 if (!fi->last_name)
211 return -ENOMEM;
212 memcpy(fi->last_name, name, len);
213 fi->last_name[len] = 0;
214 dout("note_last_dentry '%s'\n", fi->last_name);
215 return 0;
216}
217
218static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
219{
220 struct ceph_file_info *fi = filp->private_data;
221 struct inode *inode = filp->f_dentry->d_inode;
222 struct ceph_inode_info *ci = ceph_inode(inode);
223 struct ceph_client *client = ceph_inode_to_client(inode);
224 struct ceph_mds_client *mdsc = &client->mdsc;
225 unsigned frag = fpos_frag(filp->f_pos);
226 int off = fpos_off(filp->f_pos);
227 int err;
228 u32 ftype;
229 struct ceph_mds_reply_info_parsed *rinfo;
230 const int max_entries = client->mount_args->max_readdir;
231
232 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
233 if (fi->at_end)
234 return 0;
235
236 /* always start with . and .. */
237 if (filp->f_pos == 0) {
238 /* note dir version at start of readdir so we can tell
239 * if any dentries get dropped */
240 fi->dir_release_count = ci->i_release_count;
241
242 dout("readdir off 0 -> '.'\n");
243 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
244 inode->i_ino, inode->i_mode >> 12) < 0)
245 return 0;
246 filp->f_pos = 1;
247 off = 1;
248 }
249 if (filp->f_pos == 1) {
250 dout("readdir off 1 -> '..'\n");
251 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
252 filp->f_dentry->d_parent->d_inode->i_ino,
253 inode->i_mode >> 12) < 0)
254 return 0;
255 filp->f_pos = 2;
256 off = 2;
257 }
258
259 /* can we use the dcache? */
260 spin_lock(&inode->i_lock);
261 if ((filp->f_pos == 2 || fi->dentry) &&
262 !ceph_test_opt(client, NOASYNCREADDIR) &&
263 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
264 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
265 err = __dcache_readdir(filp, dirent, filldir);
266 if (err != -EAGAIN) {
267 spin_unlock(&inode->i_lock);
268 return err;
269 }
270 }
271 spin_unlock(&inode->i_lock);
272 if (fi->dentry) {
273 err = note_last_dentry(fi, fi->dentry->d_name.name,
274 fi->dentry->d_name.len);
275 if (err)
276 return err;
277 dput(fi->dentry);
278 fi->dentry = NULL;
279 }
280
281 /* proceed with a normal readdir */
282
283more:
284 /* do we have the correct frag content buffered? */
285 if (fi->frag != frag || fi->last_readdir == NULL) {
286 struct ceph_mds_request *req;
287 int op = ceph_snap(inode) == CEPH_SNAPDIR ?
288 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
289
290 /* discard old result, if any */
291 if (fi->last_readdir)
292 ceph_mdsc_put_request(fi->last_readdir);
293
294 /* requery frag tree, as the frag topology may have changed */
295 frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
296
297 dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
298 ceph_vinop(inode), frag, fi->last_name);
299 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
300 if (IS_ERR(req))
301 return PTR_ERR(req);
302 req->r_inode = igrab(inode);
303 req->r_dentry = dget(filp->f_dentry);
304 /* hints to request -> mds selection code */
305 req->r_direct_mode = USE_AUTH_MDS;
306 req->r_direct_hash = ceph_frag_value(frag);
307 req->r_direct_is_hash = true;
308 req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
309 req->r_readdir_offset = fi->next_offset;
310 req->r_args.readdir.frag = cpu_to_le32(frag);
311 req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
312 req->r_num_caps = max_entries;
313 err = ceph_mdsc_do_request(mdsc, NULL, req);
314 if (err < 0) {
315 ceph_mdsc_put_request(req);
316 return err;
317 }
318 dout("readdir got and parsed readdir result=%d"
319 " on frag %x, end=%d, complete=%d\n", err, frag,
320 (int)req->r_reply_info.dir_end,
321 (int)req->r_reply_info.dir_complete);
322
323 if (!req->r_did_prepopulate) {
324 dout("readdir !did_prepopulate");
325 fi->dir_release_count--; /* preclude I_COMPLETE */
326 }
327
328 /* note next offset and last dentry name */
329 fi->offset = fi->next_offset;
330 fi->last_readdir = req;
331
332 if (req->r_reply_info.dir_end) {
333 kfree(fi->last_name);
334 fi->last_name = NULL;
335 fi->next_offset = 0;
336 } else {
337 rinfo = &req->r_reply_info;
338 err = note_last_dentry(fi,
339 rinfo->dir_dname[rinfo->dir_nr-1],
340 rinfo->dir_dname_len[rinfo->dir_nr-1]);
341 if (err)
342 return err;
343 fi->next_offset += rinfo->dir_nr;
344 }
345 }
346
347 rinfo = &fi->last_readdir->r_reply_info;
348 dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
349 rinfo->dir_nr, off, fi->offset);
350 while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) {
351 u64 pos = ceph_make_fpos(frag, off);
352 struct ceph_mds_reply_inode *in =
353 rinfo->dir_in[off - fi->offset].in;
354 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
355 off, off - fi->offset, rinfo->dir_nr, pos,
356 rinfo->dir_dname_len[off - fi->offset],
357 rinfo->dir_dname[off - fi->offset], in);
358 BUG_ON(!in);
359 ftype = le32_to_cpu(in->mode) >> 12;
360 if (filldir(dirent,
361 rinfo->dir_dname[off - fi->offset],
362 rinfo->dir_dname_len[off - fi->offset],
363 pos,
364 le64_to_cpu(in->ino),
365 ftype) < 0) {
366 dout("filldir stopping us...\n");
367 return 0;
368 }
369 off++;
370 filp->f_pos = pos + 1;
371 }
372
373 if (fi->last_name) {
374 ceph_mdsc_put_request(fi->last_readdir);
375 fi->last_readdir = NULL;
376 goto more;
377 }
378
379 /* more frags? */
380 if (!ceph_frag_is_rightmost(frag)) {
381 frag = ceph_frag_next(frag);
382 off = 0;
383 filp->f_pos = ceph_make_fpos(frag, off);
384 dout("readdir next frag is %x\n", frag);
385 goto more;
386 }
387 fi->at_end = 1;
388
389 /*
390 * if dir_release_count still matches the dir, no dentries
391 * were released during the whole readdir, and we should have
392 * the complete dir contents in our cache.
393 */
394 spin_lock(&inode->i_lock);
395 if (ci->i_release_count == fi->dir_release_count) {
396 dout(" marking %p complete\n", inode);
397 ci->i_ceph_flags |= CEPH_I_COMPLETE;
398 ci->i_max_offset = filp->f_pos;
399 }
400 spin_unlock(&inode->i_lock);
401
402 dout("readdir %p filp %p done.\n", inode, filp);
403 return 0;
404}
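/*
 * A sketch of the directory file position encoding relied on above;
 * the helpers are assumed to pack the fragment into the high 32 bits
 * and the within-frag offset into the low 32 bits, roughly:
 *
 *	static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
 *	{
 *		return ((loff_t)frag << 32) | (loff_t)off;
 *	}
 *	static inline unsigned fpos_frag(loff_t p) { return p >> 32; }
 *	static inline unsigned fpos_off(loff_t p)  { return p & 0xffffffff; }
 *
 * With this layout, offsets 0 and 1 ('.' and '..') always sort before
 * any fragment content, and a seek across a frag boundary can be
 * detected by comparing fpos_frag() of the old and new positions.
 */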
405
406static void reset_readdir(struct ceph_file_info *fi)
407{
408 if (fi->last_readdir) {
409 ceph_mdsc_put_request(fi->last_readdir);
410 fi->last_readdir = NULL;
411 }
412 kfree(fi->last_name);
413 fi->next_offset = 2; /* compensate for . and .. */
414 if (fi->dentry) {
415 dput(fi->dentry);
416 fi->dentry = NULL;
417 }
418 fi->at_end = 0;
419}
420
421static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
422{
423 struct ceph_file_info *fi = file->private_data;
424 struct inode *inode = file->f_mapping->host;
425 loff_t old_offset = offset;
426 loff_t retval;
427
428 mutex_lock(&inode->i_mutex);
429 switch (origin) {
430 case SEEK_END:
431 offset += inode->i_size + 2; /* FIXME */
432 break;
433 case SEEK_CUR:
434 offset += file->f_pos;
435 }
436 retval = -EINVAL;
437 if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
438 if (offset != file->f_pos) {
439 file->f_pos = offset;
440 file->f_version = 0;
441 fi->at_end = 0;
442 }
443 retval = offset;
444
445 /*
446 * discard buffered readdir content on seekdir(0), or
447 * seek to new frag, or seek prior to current chunk.
448 */
449 if (offset == 0 ||
450 fpos_frag(offset) != fpos_frag(old_offset) ||
451 fpos_off(offset) < fi->offset) {
452 dout("dir_llseek dropping %p content\n", file);
453 reset_readdir(fi);
454 }
455
456 /* bump dir_release_count if we did a forward seek */
457 if (offset > old_offset)
458 fi->dir_release_count--;
459 }
460 mutex_unlock(&inode->i_mutex);
461 return retval;
462}
463
464/*
465 * Process result of a lookup/open request.
466 *
467 * Mainly, make sure we return the final req->r_dentry (if it already
468 * existed) in place of the original VFS-provided dentry when they
469 * differ.
470 *
471 * Gracefully handle the case where the MDS replies with -ENOENT and
472 * no trace (which it may do, at its discretion, e.g., if it doesn't
473 * care to issue a lease on the negative dentry).
474 */
475struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
476 struct dentry *dentry, int err)
477{
478 struct ceph_client *client = ceph_client(dentry->d_sb);
479 struct inode *parent = dentry->d_parent->d_inode;
480
481 /* .snap dir? */
482 if (err == -ENOENT &&
483 ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
484 strcmp(dentry->d_name.name,
485 client->mount_args->snapdir_name) == 0) {
486 struct inode *inode = ceph_get_snapdir(parent);
487 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
488 dentry, dentry->d_name.len, dentry->d_name.name, inode);
489 d_add(dentry, inode);
490 err = 0;
491 }
492
493 if (err == -ENOENT) {
494 /* no trace? */
495 err = 0;
496 if (!req->r_reply_info.head->is_dentry) {
497 dout("ENOENT and no trace, dentry %p inode %p\n",
498 dentry, dentry->d_inode);
499 if (dentry->d_inode) {
500 d_drop(dentry);
501 err = -ENOENT;
502 } else {
503 d_add(dentry, NULL);
504 }
505 }
506 }
507 if (err)
508 dentry = ERR_PTR(err);
509 else if (dentry != req->r_dentry)
510 dentry = dget(req->r_dentry); /* we got spliced */
511 else
512 dentry = NULL;
513 return dentry;
514}
515
516static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
517{
518 return ceph_ino(inode) == CEPH_INO_ROOT &&
519 strncmp(dentry->d_name.name, ".ceph", 5) == 0;
520}
521
522/*
523 * Look up a single dir entry. If there is a lookup intent, inform
524 * the MDS so that it gets our 'caps wanted' value in a single op.
525 */
526static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
527 struct nameidata *nd)
528{
529 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
530 struct ceph_mds_client *mdsc = &client->mdsc;
531 struct ceph_mds_request *req;
532 int op;
533 int err;
534
535 dout("lookup %p dentry %p '%.*s'\n",
536 dir, dentry, dentry->d_name.len, dentry->d_name.name);
537
538 if (dentry->d_name.len > NAME_MAX)
539 return ERR_PTR(-ENAMETOOLONG);
540
541 err = ceph_init_dentry(dentry);
542 if (err < 0)
543 return ERR_PTR(err);
544
545 /* open (but not create!) intent? */
546 if (nd &&
547 (nd->flags & LOOKUP_OPEN) &&
548 (nd->flags & LOOKUP_CONTINUE) == 0 && /* only open last component */
549 !(nd->intent.open.flags & O_CREAT)) {
550 int mode = nd->intent.open.create_mode & ~current->fs->umask;
551 return ceph_lookup_open(dir, dentry, nd, mode, 1);
552 }
553
554 /* can we conclude ENOENT locally? */
555 if (dentry->d_inode == NULL) {
556 struct ceph_inode_info *ci = ceph_inode(dir);
557 struct ceph_dentry_info *di = ceph_dentry(dentry);
558
559 spin_lock(&dir->i_lock);
560 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
561 if (strncmp(dentry->d_name.name,
562 client->mount_args->snapdir_name,
563 dentry->d_name.len) &&
564 !is_root_ceph_dentry(dir, dentry) &&
565 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
566 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
567 di->offset = ci->i_max_offset++;
568 spin_unlock(&dir->i_lock);
569 dout(" dir %p complete, -ENOENT\n", dir);
570 d_add(dentry, NULL);
571 di->lease_shared_gen = ci->i_shared_gen;
572 return NULL;
573 }
574 spin_unlock(&dir->i_lock);
575 }
576
577 op = ceph_snap(dir) == CEPH_SNAPDIR ?
578 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
579 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
580 if (IS_ERR(req))
581 return ERR_PTR(PTR_ERR(req));
582 req->r_dentry = dget(dentry);
583 req->r_num_caps = 2;
584 /* we only need inode linkage */
585 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
586 req->r_locked_dir = dir;
587 err = ceph_mdsc_do_request(mdsc, NULL, req);
588 dentry = ceph_finish_lookup(req, dentry, err);
589 ceph_mdsc_put_request(req); /* will dput(dentry) */
590 dout("lookup result=%p\n", dentry);
591 return dentry;
592}
593
594/*
595 * If we do a create but get no trace back from the MDS, follow up with
596 * a lookup (the VFS expects us to link up the provided dentry).
597 */
598int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
599{
600 struct dentry *result = ceph_lookup(dir, dentry, NULL);
601
602 if (result && !IS_ERR(result)) {
603 /*
604 * We created the item, then did a lookup, and found
605 * it was already linked to another inode we already
606 * had in our cache (and thus got spliced). Link our
607 * dentry to that inode, but don't hash it, just in
608 * case the VFS wants to dereference it.
609 */
610 BUG_ON(!result->d_inode);
611 d_instantiate(dentry, result->d_inode);
612 return 0;
613 }
614 return PTR_ERR(result);
615}
616
617static int ceph_mknod(struct inode *dir, struct dentry *dentry,
618 int mode, dev_t rdev)
619{
620 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
621 struct ceph_mds_client *mdsc = &client->mdsc;
622 struct ceph_mds_request *req;
623 int err;
624
625 if (ceph_snap(dir) != CEPH_NOSNAP)
626 return -EROFS;
627
628 dout("mknod in dir %p dentry %p mode 0%o rdev %d\n",
629 dir, dentry, mode, rdev);
630 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
631 if (IS_ERR(req)) {
632 d_drop(dentry);
633 return PTR_ERR(req);
634 }
635 req->r_dentry = dget(dentry);
636 req->r_num_caps = 2;
637 req->r_locked_dir = dir;
638 req->r_args.mknod.mode = cpu_to_le32(mode);
639 req->r_args.mknod.rdev = cpu_to_le32(rdev);
640 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
641 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
642 err = ceph_mdsc_do_request(mdsc, dir, req);
643 if (!err && !req->r_reply_info.head->is_dentry)
644 err = ceph_handle_notrace_create(dir, dentry);
645 ceph_mdsc_put_request(req);
646 if (err)
647 d_drop(dentry);
648 return err;
649}
650
651static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
652 struct nameidata *nd)
653{
654 dout("create in dir %p dentry %p name '%.*s'\n",
655 dir, dentry, dentry->d_name.len, dentry->d_name.name);
656
657 if (ceph_snap(dir) != CEPH_NOSNAP)
658 return -EROFS;
659
660 if (nd) {
661 BUG_ON((nd->flags & LOOKUP_OPEN) == 0);
662 dentry = ceph_lookup_open(dir, dentry, nd, mode, 0);
663 /* hrm, what should i do here if we get aliased? */
664 if (IS_ERR(dentry))
665 return PTR_ERR(dentry);
666 return 0;
667 }
668
669 /* fall back to mknod */
670 return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0);
671}
672
673static int ceph_symlink(struct inode *dir, struct dentry *dentry,
674 const char *dest)
675{
676 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
677 struct ceph_mds_client *mdsc = &client->mdsc;
678 struct ceph_mds_request *req;
679 int err;
680
681 if (ceph_snap(dir) != CEPH_NOSNAP)
682 return -EROFS;
683
684 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
685 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
686 if (IS_ERR(req)) {
687 d_drop(dentry);
688 return PTR_ERR(req);
689 }
690 req->r_dentry = dget(dentry);
691 req->r_num_caps = 2;
692 req->r_path2 = kstrdup(dest, GFP_NOFS);
693 req->r_locked_dir = dir;
694 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
695 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
696 err = ceph_mdsc_do_request(mdsc, dir, req);
697 if (!err && !req->r_reply_info.head->is_dentry)
698 err = ceph_handle_notrace_create(dir, dentry);
699 ceph_mdsc_put_request(req);
700 if (err)
701 d_drop(dentry);
702 return err;
703}
704
705static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
706{
707 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
708 struct ceph_mds_client *mdsc = &client->mdsc;
709 struct ceph_mds_request *req;
710 int err = -EROFS;
711 int op;
712
713 if (ceph_snap(dir) == CEPH_SNAPDIR) {
714 /* mkdir .snap/foo is a MKSNAP */
715 op = CEPH_MDS_OP_MKSNAP;
716 dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
717 dentry->d_name.len, dentry->d_name.name, dentry);
718 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
719 dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode);
720 op = CEPH_MDS_OP_MKDIR;
721 } else {
722 goto out;
723 }
724 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
725 if (IS_ERR(req)) {
726 err = PTR_ERR(req);
727 goto out;
728 }
729
730 req->r_dentry = dget(dentry);
731 req->r_num_caps = 2;
732 req->r_locked_dir = dir;
733 req->r_args.mkdir.mode = cpu_to_le32(mode);
734 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
735 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
736 err = ceph_mdsc_do_request(mdsc, dir, req);
737 if (!err && !req->r_reply_info.head->is_dentry)
738 err = ceph_handle_notrace_create(dir, dentry);
739 ceph_mdsc_put_request(req);
740out:
741 if (err < 0)
742 d_drop(dentry);
743 return err;
744}
745
746static int ceph_link(struct dentry *old_dentry, struct inode *dir,
747 struct dentry *dentry)
748{
749 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
750 struct ceph_mds_client *mdsc = &client->mdsc;
751 struct ceph_mds_request *req;
752 int err;
753
754 if (ceph_snap(dir) != CEPH_NOSNAP)
755 return -EROFS;
756
757 dout("link in dir %p old_dentry %p dentry %p\n", dir,
758 old_dentry, dentry);
759 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
760 if (IS_ERR(req)) {
761 d_drop(dentry);
762 return PTR_ERR(req);
763 }
764 req->r_dentry = dget(dentry);
765 req->r_num_caps = 2;
766 req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
767 req->r_locked_dir = dir;
768 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
769 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
770 err = ceph_mdsc_do_request(mdsc, dir, req);
771 if (err)
772 d_drop(dentry);
773 else if (!req->r_reply_info.head->is_dentry)
774 d_instantiate(dentry, igrab(old_dentry->d_inode));
775 ceph_mdsc_put_request(req);
776 return err;
777}
778
779/*
 780 * For a soon-to-be unlinked file, drop the LINK_SHARED/LINK_EXCL caps. If it
781 * looks like the link count will hit 0, drop any other caps (other
782 * than PIN) we don't specifically want (due to the file still being
783 * open).
784 */
785static int drop_caps_for_unlink(struct inode *inode)
786{
787 struct ceph_inode_info *ci = ceph_inode(inode);
788 int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
789
790 spin_lock(&inode->i_lock);
791 if (inode->i_nlink == 1) {
792 drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
793 ci->i_ceph_flags |= CEPH_I_NODELAY;
794 }
795 spin_unlock(&inode->i_lock);
796 return drop;
797}
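/*
 * For example, if the link count is about to hit 0 and the file is
 * still open with __ceph_caps_wanted() == CEPH_CAP_FILE_CACHE, the
 * mask computed above is
 *
 *	CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL |
 *		~(CEPH_CAP_FILE_CACHE | CEPH_CAP_PIN)
 *
 * i.e. everything except PIN and the caps the open file still wants.
 */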
798
799/*
 800 * rmdir and unlink differ only by the metadata op code
801 */
802static int ceph_unlink(struct inode *dir, struct dentry *dentry)
803{
804 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
805 struct ceph_mds_client *mdsc = &client->mdsc;
806 struct inode *inode = dentry->d_inode;
807 struct ceph_mds_request *req;
808 int err = -EROFS;
809 int op;
810
811 if (ceph_snap(dir) == CEPH_SNAPDIR) {
812 /* rmdir .snap/foo is RMSNAP */
813 dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len,
814 dentry->d_name.name, dentry);
815 op = CEPH_MDS_OP_RMSNAP;
816 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
817 dout("unlink/rmdir dir %p dn %p inode %p\n",
818 dir, dentry, inode);
819 op = ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR) ?
820 CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
821 } else
822 goto out;
823 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
824 if (IS_ERR(req)) {
825 err = PTR_ERR(req);
826 goto out;
827 }
828 req->r_dentry = dget(dentry);
829 req->r_num_caps = 2;
830 req->r_locked_dir = dir;
831 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
832 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
833 req->r_inode_drop = drop_caps_for_unlink(inode);
834 err = ceph_mdsc_do_request(mdsc, dir, req);
835 if (!err && !req->r_reply_info.head->is_dentry)
836 d_delete(dentry);
837 ceph_mdsc_put_request(req);
838out:
839 return err;
840}
841
842static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
843 struct inode *new_dir, struct dentry *new_dentry)
844{
845 struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
846 struct ceph_mds_client *mdsc = &client->mdsc;
847 struct ceph_mds_request *req;
848 int err;
849
850 if (ceph_snap(old_dir) != ceph_snap(new_dir))
851 return -EXDEV;
852 if (ceph_snap(old_dir) != CEPH_NOSNAP ||
853 ceph_snap(new_dir) != CEPH_NOSNAP)
854 return -EROFS;
855 dout("rename dir %p dentry %p to dir %p dentry %p\n",
856 old_dir, old_dentry, new_dir, new_dentry);
857 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
858 if (IS_ERR(req))
859 return PTR_ERR(req);
860 req->r_dentry = dget(new_dentry);
861 req->r_num_caps = 2;
862 req->r_old_dentry = dget(old_dentry);
863 req->r_locked_dir = new_dir;
864 req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
865 req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
866 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
867 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
 868	/* release LINK_SHARED on source inode (mds will lock it) */
869 req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
870 if (new_dentry->d_inode)
871 req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode);
872 err = ceph_mdsc_do_request(mdsc, old_dir, req);
873 if (!err && !req->r_reply_info.head->is_dentry) {
874 /*
875 * Normally d_move() is done by fill_trace (called by
876 * do_request, above). If there is no trace, we need
877 * to do it here.
878 */
879 d_move(old_dentry, new_dentry);
880 }
881 ceph_mdsc_put_request(req);
882 return err;
883}
884
885
886/*
887 * Check if dentry lease is valid. If not, delete the lease. Try to
 888 * renew if the lease is more than half up.
889 */
890static int dentry_lease_is_valid(struct dentry *dentry)
891{
892 struct ceph_dentry_info *di;
893 struct ceph_mds_session *s;
894 int valid = 0;
895 u32 gen;
896 unsigned long ttl;
897 struct ceph_mds_session *session = NULL;
898 struct inode *dir = NULL;
899 u32 seq = 0;
900
901 spin_lock(&dentry->d_lock);
902 di = ceph_dentry(dentry);
903 if (di && di->lease_session) {
904 s = di->lease_session;
905 spin_lock(&s->s_cap_lock);
906 gen = s->s_cap_gen;
907 ttl = s->s_cap_ttl;
908 spin_unlock(&s->s_cap_lock);
909
910 if (di->lease_gen == gen &&
911 time_before(jiffies, dentry->d_time) &&
912 time_before(jiffies, ttl)) {
913 valid = 1;
914 if (di->lease_renew_after &&
915 time_after(jiffies, di->lease_renew_after)) {
916 /* we should renew */
917 dir = dentry->d_parent->d_inode;
918 session = ceph_get_mds_session(s);
919 seq = di->lease_seq;
920 di->lease_renew_after = 0;
921 di->lease_renew_from = jiffies;
922 }
923 }
924 }
925 spin_unlock(&dentry->d_lock);
926
927 if (session) {
928 ceph_mdsc_lease_send_msg(session, dir, dentry,
929 CEPH_MDS_LEASE_RENEW, seq);
930 ceph_put_mds_session(session);
931 }
932 dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
933 return valid;
934}
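/*
 * (di->lease_renew_after is presumably set when the lease is
 * installed, at roughly the midpoint of its lifetime, e.g. something
 * like
 *
 *	di->lease_renew_after = issue_time + lease_duration / 2;
 *
 * so the renewal above fires once the lease is more than half up.)
 */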
935
936/*
937 * Check if directory-wide content lease/cap is valid.
938 */
939static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
940{
941 struct ceph_inode_info *ci = ceph_inode(dir);
942 struct ceph_dentry_info *di = ceph_dentry(dentry);
943 int valid = 0;
944
945 spin_lock(&dir->i_lock);
946 if (ci->i_shared_gen == di->lease_shared_gen)
947 valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
948 spin_unlock(&dir->i_lock);
949 dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
950 dir, (unsigned)ci->i_shared_gen, dentry,
951 (unsigned)di->lease_shared_gen, valid);
952 return valid;
953}
954
955/*
956 * Check if cached dentry can be trusted.
957 */
958static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
959{
960 struct inode *dir = dentry->d_parent->d_inode;
961
962 dout("d_revalidate %p '%.*s' inode %p\n", dentry,
963 dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
964
965 /* always trust cached snapped dentries, snapdir dentry */
966 if (ceph_snap(dir) != CEPH_NOSNAP) {
967 dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
968 dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
969 goto out_touch;
970 }
971 if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR)
972 goto out_touch;
973
974 if (dentry_lease_is_valid(dentry) ||
975 dir_lease_is_valid(dir, dentry))
976 goto out_touch;
977
978 dout("d_revalidate %p invalid\n", dentry);
979 d_drop(dentry);
980 return 0;
981out_touch:
982 ceph_dentry_lru_touch(dentry);
983 return 1;
984}
985
986/*
987 * When a dentry is released, clear the dir I_COMPLETE if it was part
988 * of the current dir gen.
989 */
990static void ceph_dentry_release(struct dentry *dentry)
991{
992 struct ceph_dentry_info *di = ceph_dentry(dentry);
993 struct inode *parent_inode = dentry->d_parent->d_inode;
994
995 if (parent_inode) {
996 struct ceph_inode_info *ci = ceph_inode(parent_inode);
997
998 spin_lock(&parent_inode->i_lock);
999 if (ci->i_shared_gen == di->lease_shared_gen) {
1000 dout(" clearing %p complete (d_release)\n",
1001 parent_inode);
1002 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1003 ci->i_release_count++;
1004 }
1005 spin_unlock(&parent_inode->i_lock);
1006 }
1007 if (di) {
1008 ceph_dentry_lru_del(dentry);
1009 if (di->lease_session)
1010 ceph_put_mds_session(di->lease_session);
1011 kmem_cache_free(ceph_dentry_cachep, di);
1012 dentry->d_fsdata = NULL;
1013 }
1014}
1015
1016static int ceph_snapdir_d_revalidate(struct dentry *dentry,
1017 struct nameidata *nd)
1018{
1019 /*
1020 * Eventually, we'll want to revalidate snapped metadata
1021 * too... probably...
1022 */
1023 return 1;
1024}
1025
1026
1027
1028/*
1029 * read() on a dir. This weird interface hack only works if mounted
1030 * with '-o dirstat'.
1031 */
1032static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1033 loff_t *ppos)
1034{
1035 struct ceph_file_info *cf = file->private_data;
1036 struct inode *inode = file->f_dentry->d_inode;
1037 struct ceph_inode_info *ci = ceph_inode(inode);
1038 int left;
1039
1040 if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT))
1041 return -EISDIR;
1042
1043 if (!cf->dir_info) {
1044 cf->dir_info = kmalloc(1024, GFP_NOFS);
1045 if (!cf->dir_info)
1046 return -ENOMEM;
1047 cf->dir_info_len =
1048 sprintf(cf->dir_info,
1049 "entries: %20lld\n"
1050 " files: %20lld\n"
1051 " subdirs: %20lld\n"
1052 "rentries: %20lld\n"
1053 " rfiles: %20lld\n"
1054 " rsubdirs: %20lld\n"
1055 "rbytes: %20lld\n"
1056 "rctime: %10ld.%09ld\n",
1057 ci->i_files + ci->i_subdirs,
1058 ci->i_files,
1059 ci->i_subdirs,
1060 ci->i_rfiles + ci->i_rsubdirs,
1061 ci->i_rfiles,
1062 ci->i_rsubdirs,
1063 ci->i_rbytes,
1064 (long)ci->i_rctime.tv_sec,
1065 (long)ci->i_rctime.tv_nsec);
1066 }
1067
1068 if (*ppos >= cf->dir_info_len)
1069 return 0;
1070 size = min_t(unsigned, size, cf->dir_info_len-*ppos);
1071 left = copy_to_user(buf, cf->dir_info + *ppos, size);
1072 if (left == size)
1073 return -EFAULT;
1074 *ppos += (size - left);
1075 return size - left;
1076}
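/*
 * An example session, assuming a mount with -o dirstat (path and
 * numbers are illustrative):
 *
 *	$ cat /mnt/ceph/mydir
 *	entries:                   10
 *	 files:                     7
 *	 subdirs:                   3
 *	rentries:                 142
 *	 rfiles:                  120
 *	 rsubdirs:                 22
 *	rbytes:               1048576
 *	rctime: 1269000000.000000000
 *
 * The values come straight from the i_files/i_subdirs and recursive
 * r* statistics maintained by the MDS.
 */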
1077
1078/*
1079 * an fsync() on a dir will wait for any uncommitted directory
1080 * operations to commit.
1081 */
1082static int ceph_dir_fsync(struct file *file, struct dentry *dentry,
1083 int datasync)
1084{
1085 struct inode *inode = dentry->d_inode;
1086 struct ceph_inode_info *ci = ceph_inode(inode);
1087 struct list_head *head = &ci->i_unsafe_dirops;
1088 struct ceph_mds_request *req;
1089 u64 last_tid;
1090 int ret = 0;
1091
1092 dout("dir_fsync %p\n", inode);
1093 spin_lock(&ci->i_unsafe_lock);
1094 if (list_empty(head))
1095 goto out;
1096
1097 req = list_entry(head->prev,
1098 struct ceph_mds_request, r_unsafe_dir_item);
1099 last_tid = req->r_tid;
1100
1101 do {
1102 ceph_mdsc_get_request(req);
1103 spin_unlock(&ci->i_unsafe_lock);
1104 dout("dir_fsync %p wait on tid %llu (until %llu)\n",
1105 inode, req->r_tid, last_tid);
1106 if (req->r_timeout) {
1107 ret = wait_for_completion_timeout(
1108 &req->r_safe_completion, req->r_timeout);
1109 if (ret > 0)
1110 ret = 0;
1111 else if (ret == 0)
1112 ret = -EIO; /* timed out */
1113 } else {
1114 wait_for_completion(&req->r_safe_completion);
1115 }
1116 spin_lock(&ci->i_unsafe_lock);
1117 ceph_mdsc_put_request(req);
1118
1119 if (ret || list_empty(head))
1120 break;
1121 req = list_entry(head->next,
1122 struct ceph_mds_request, r_unsafe_dir_item);
1123 } while (req->r_tid < last_tid);
1124out:
1125 spin_unlock(&ci->i_unsafe_lock);
1126 return ret;
1127}
1128
1129/*
1130 * We maintain a private dentry LRU.
1131 *
1132 * FIXME: this needs to be changed to a per-mds lru to be useful.
1133 */
1134void ceph_dentry_lru_add(struct dentry *dn)
1135{
1136 struct ceph_dentry_info *di = ceph_dentry(dn);
1137 struct ceph_mds_client *mdsc;
1138
1139 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1140 dn->d_name.len, dn->d_name.name);
1141 if (di) {
1142 mdsc = &ceph_client(dn->d_sb)->mdsc;
1143 spin_lock(&mdsc->dentry_lru_lock);
1144 list_add_tail(&di->lru, &mdsc->dentry_lru);
1145 mdsc->num_dentry++;
1146 spin_unlock(&mdsc->dentry_lru_lock);
1147 }
1148}
1149
1150void ceph_dentry_lru_touch(struct dentry *dn)
1151{
1152 struct ceph_dentry_info *di = ceph_dentry(dn);
1153 struct ceph_mds_client *mdsc;
1154
1155 dout("dentry_lru_touch %p %p '%.*s'\n", di, dn,
1156 dn->d_name.len, dn->d_name.name);
1157 if (di) {
1158 mdsc = &ceph_client(dn->d_sb)->mdsc;
1159 spin_lock(&mdsc->dentry_lru_lock);
1160 list_move_tail(&di->lru, &mdsc->dentry_lru);
1161 spin_unlock(&mdsc->dentry_lru_lock);
1162 }
1163}
1164
1165void ceph_dentry_lru_del(struct dentry *dn)
1166{
1167 struct ceph_dentry_info *di = ceph_dentry(dn);
1168 struct ceph_mds_client *mdsc;
1169
1170 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1171 dn->d_name.len, dn->d_name.name);
1172 if (di) {
1173 mdsc = &ceph_client(dn->d_sb)->mdsc;
1174 spin_lock(&mdsc->dentry_lru_lock);
1175 list_del_init(&di->lru);
1176 mdsc->num_dentry--;
1177 spin_unlock(&mdsc->dentry_lru_lock);
1178 }
1179}
1180
1181const struct file_operations ceph_dir_fops = {
1182 .read = ceph_read_dir,
1183 .readdir = ceph_readdir,
1184 .llseek = ceph_dir_llseek,
1185 .open = ceph_open,
1186 .release = ceph_release,
1187 .unlocked_ioctl = ceph_ioctl,
1188 .fsync = ceph_dir_fsync,
1189};
1190
1191const struct inode_operations ceph_dir_iops = {
1192 .lookup = ceph_lookup,
1193 .permission = ceph_permission,
1194 .getattr = ceph_getattr,
1195 .setattr = ceph_setattr,
1196 .setxattr = ceph_setxattr,
1197 .getxattr = ceph_getxattr,
1198 .listxattr = ceph_listxattr,
1199 .removexattr = ceph_removexattr,
1200 .mknod = ceph_mknod,
1201 .symlink = ceph_symlink,
1202 .mkdir = ceph_mkdir,
1203 .link = ceph_link,
1204 .unlink = ceph_unlink,
1205 .rmdir = ceph_unlink,
1206 .rename = ceph_rename,
1207 .create = ceph_create,
1208};
1209
1210struct dentry_operations ceph_dentry_ops = {
1211 .d_revalidate = ceph_d_revalidate,
1212 .d_release = ceph_dentry_release,
1213};
1214
1215struct dentry_operations ceph_snapdir_dentry_ops = {
1216 .d_revalidate = ceph_snapdir_d_revalidate,
1217};
1218
1219struct dentry_operations ceph_snap_dentry_ops = {
1220};
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
new file mode 100644
index 000000000000..fc68e39cbad6
--- /dev/null
+++ b/fs/ceph/export.c
@@ -0,0 +1,223 @@
1#include "ceph_debug.h"
2
3#include <linux/exportfs.h>
4#include <asm/unaligned.h>
5
6#include "super.h"
7
8/*
9 * NFS export support
10 *
11 * NFS re-export of a ceph mount is, at present, only semireliable.
 12 * The basic issue is that the Ceph architecture doesn't lend itself
13 * well to generating filehandles that will remain valid forever.
14 *
15 * So, we do our best. If you're lucky, your inode will be in the
16 * client's cache. If it's not, and you have a connectable fh, then
17 * the MDS server may be able to find it for you. Otherwise, you get
18 * ESTALE.
19 *
 20 * There are ways to make this more reliable, but in the non-connectable fh
 21 * case, we won't ever work perfectly, and in the connectable case,
22 * some changes are needed on the MDS side to work better.
23 */
24
25/*
26 * Basic fh
27 */
28struct ceph_nfs_fh {
29 u64 ino;
30} __attribute__ ((packed));
31
32/*
33 * Larger 'connectable' fh that includes parent ino and name hash.
34 * Use this whenever possible, as it works more reliably.
35 */
36struct ceph_nfs_confh {
37 u64 ino, parent_ino;
38 u32 parent_name_hash;
39} __attribute__ ((packed));
40
41static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
42 int connectable)
43{
44 struct ceph_nfs_fh *fh = (void *)rawfh;
45 struct ceph_nfs_confh *cfh = (void *)rawfh;
46 struct dentry *parent = dentry->d_parent;
47 struct inode *inode = dentry->d_inode;
48 int type;
49
50 /* don't re-export snaps */
51 if (ceph_snap(inode) != CEPH_NOSNAP)
52 return -EINVAL;
53
54 if (*max_len >= sizeof(*cfh)) {
55 dout("encode_fh %p connectable\n", dentry);
56 cfh->ino = ceph_ino(dentry->d_inode);
57 cfh->parent_ino = ceph_ino(parent->d_inode);
58 cfh->parent_name_hash = parent->d_name.hash;
59 *max_len = sizeof(*cfh);
60 type = 2;
61 } else if (*max_len > sizeof(*fh)) {
62 if (connectable)
63 return -ENOSPC;
64 dout("encode_fh %p\n", dentry);
65 fh->ino = ceph_ino(dentry->d_inode);
66 *max_len = sizeof(*fh);
67 type = 1;
68 } else {
69 return -ENOSPC;
70 }
71 return type;
72}
73
74/*
75 * convert regular fh to dentry
76 *
77 * FIXME: we should try harder by querying the mds for the ino.
78 */
79static struct dentry *__fh_to_dentry(struct super_block *sb,
80 struct ceph_nfs_fh *fh)
81{
82 struct inode *inode;
83 struct dentry *dentry;
84 struct ceph_vino vino;
85 int err;
86
87 dout("__fh_to_dentry %llx\n", fh->ino);
88 vino.ino = fh->ino;
89 vino.snap = CEPH_NOSNAP;
90 inode = ceph_find_inode(sb, vino);
91 if (!inode)
92 return ERR_PTR(-ESTALE);
93
94 dentry = d_obtain_alias(inode);
95 if (!dentry) {
96 pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
97 fh->ino, inode);
98 iput(inode);
99 return ERR_PTR(-ENOMEM);
100 }
101 err = ceph_init_dentry(dentry);
102
103 if (err < 0) {
104 iput(inode);
105 return ERR_PTR(err);
106 }
107 dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry);
108 return dentry;
109}
110
111/*
112 * convert connectable fh to dentry
113 */
114static struct dentry *__cfh_to_dentry(struct super_block *sb,
115 struct ceph_nfs_confh *cfh)
116{
117 struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc;
118 struct inode *inode;
119 struct dentry *dentry;
120 struct ceph_vino vino;
121 int err;
122
123 dout("__cfh_to_dentry %llx (%llx/%x)\n",
124 cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
125
126 vino.ino = cfh->ino;
127 vino.snap = CEPH_NOSNAP;
128 inode = ceph_find_inode(sb, vino);
129 if (!inode) {
130 struct ceph_mds_request *req;
131
132 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
133 USE_ANY_MDS);
134 if (IS_ERR(req))
135 return ERR_PTR(PTR_ERR(req));
136
137 req->r_ino1 = vino;
138 req->r_ino2.ino = cfh->parent_ino;
139 req->r_ino2.snap = CEPH_NOSNAP;
140 req->r_path2 = kmalloc(16, GFP_NOFS);
141 snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash);
142 req->r_num_caps = 1;
143 err = ceph_mdsc_do_request(mdsc, NULL, req);
144 ceph_mdsc_put_request(req);
145 inode = ceph_find_inode(sb, vino);
146 if (!inode)
147 return ERR_PTR(err ? err : -ESTALE);
148 }
149
150 dentry = d_obtain_alias(inode);
151 if (!dentry) {
152 pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
153 cfh->ino, inode);
154 iput(inode);
155 return ERR_PTR(-ENOMEM);
156 }
157 err = ceph_init_dentry(dentry);
158 if (err < 0) {
159 iput(inode);
160 return ERR_PTR(err);
161 }
162 dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry);
163 return dentry;
164}
165
166static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid,
167 int fh_len, int fh_type)
168{
169 if (fh_type == 1)
170 return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw);
171 else
172 return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw);
173}
174
175/*
176 * get parent, if possible.
177 *
178 * FIXME: we could do better by querying the mds to discover the
179 * parent.
180 */
181static struct dentry *ceph_fh_to_parent(struct super_block *sb,
182 struct fid *fid,
183 int fh_len, int fh_type)
184{
185 struct ceph_nfs_confh *cfh = (void *)fid->raw;
186 struct ceph_vino vino;
187 struct inode *inode;
188 struct dentry *dentry;
189 int err;
190
191 if (fh_type == 1)
192 return ERR_PTR(-ESTALE);
193
194 pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino,
195 cfh->parent_name_hash);
196
197 vino.ino = cfh->ino;
198 vino.snap = CEPH_NOSNAP;
199 inode = ceph_find_inode(sb, vino);
200 if (!inode)
201 return ERR_PTR(-ESTALE);
202
203 dentry = d_obtain_alias(inode);
204 if (!dentry) {
205 pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
206 cfh->ino, inode);
207 iput(inode);
208 return ERR_PTR(-ENOMEM);
209 }
210 err = ceph_init_dentry(dentry);
211 if (err < 0) {
212 iput(inode);
213 return ERR_PTR(err);
214 }
215 dout("fh_to_parent %llx %p dentry %p\n", cfh->ino, inode, dentry);
216 return dentry;
217}
218
219const struct export_operations ceph_export_ops = {
220 .encode_fh = ceph_encode_fh,
221 .fh_to_dentry = ceph_fh_to_dentry,
222 .fh_to_parent = ceph_fh_to_parent,
223};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
new file mode 100644
index 000000000000..5d2af8464f6a
--- /dev/null
+++ b/fs/ceph/file.c
@@ -0,0 +1,937 @@
1#include "ceph_debug.h"
2
3#include <linux/sched.h>
4#include <linux/file.h>
5#include <linux/namei.h>
6#include <linux/writeback.h>
7
8#include "super.h"
9#include "mds_client.h"
10
11/*
12 * Ceph file operations
13 *
14 * Implement basic open/close functionality, and implement
15 * read/write.
16 *
17 * We implement three modes of file I/O:
18 * - buffered uses the generic_file_aio_{read,write} helpers
19 *
20 * - synchronous is used when there is multi-client read/write
21 * sharing, avoids the page cache, and synchronously waits for an
22 * ack from the OSD.
23 *
24 * - direct io takes the variant of the sync path that references
25 * user pages directly.
26 *
27 * fsync() flushes and waits on dirty pages, but just queues metadata
28 * for writeback: since the MDS can recover size and mtime there is no
29 * need to wait for MDS acknowledgement.
30 */
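/*
 * The mode is chosen per call rather than per open; ceph_aio_read
 * below, for instance, takes the sync/O_DIRECT path with (roughly)
 *
 *	if ((got & CEPH_CAP_FILE_CACHE) == 0 ||
 *	    (filp->f_flags & O_DIRECT) ||
 *	    (inode->i_sb->s_flags & MS_SYNCHRONOUS))
 *		ret = ceph_sync_read(...);
 *	else
 *		ret = generic_file_aio_read(...);
 *
 * i.e. buffered I/O is used only while the FILE_CACHE cap is held.
 */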
31
32
33/*
34 * Prepare an open request. Preallocate ceph_cap to avoid an
35 * inopportune ENOMEM later.
36 */
37static struct ceph_mds_request *
38prepare_open_request(struct super_block *sb, int flags, int create_mode)
39{
40 struct ceph_client *client = ceph_sb_to_client(sb);
41 struct ceph_mds_client *mdsc = &client->mdsc;
42 struct ceph_mds_request *req;
43 int want_auth = USE_ANY_MDS;
44 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
45
46 if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC))
47 want_auth = USE_AUTH_MDS;
48
49 req = ceph_mdsc_create_request(mdsc, op, want_auth);
50 if (IS_ERR(req))
51 goto out;
52 req->r_fmode = ceph_flags_to_mode(flags);
53 req->r_args.open.flags = cpu_to_le32(flags);
54 req->r_args.open.mode = cpu_to_le32(create_mode);
55 req->r_args.open.preferred = cpu_to_le32(-1);
56out:
57 return req;
58}
59
60/*
61 * initialize private struct file data.
62 * if we fail, clean up by dropping fmode reference on the ceph_inode
63 */
64static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
65{
66 struct ceph_file_info *cf;
67 int ret = 0;
68
69 switch (inode->i_mode & S_IFMT) {
70 case S_IFREG:
71 case S_IFDIR:
72 dout("init_file %p %p 0%o (regular)\n", inode, file,
73 inode->i_mode);
74 cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
75 if (cf == NULL) {
76 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
77 return -ENOMEM;
78 }
79 cf->fmode = fmode;
80 cf->next_offset = 2;
81 file->private_data = cf;
82 BUG_ON(inode->i_fop->release != ceph_release);
83 break;
84
85 case S_IFLNK:
86 dout("init_file %p %p 0%o (symlink)\n", inode, file,
87 inode->i_mode);
88 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
89 break;
90
91 default:
92 dout("init_file %p %p 0%o (special)\n", inode, file,
93 inode->i_mode);
94 /*
95 * we need to drop the open ref now, since we don't
96 * have .release set to ceph_release.
97 */
98 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
99 BUG_ON(inode->i_fop->release == ceph_release);
100
101 /* call the proper open fop */
102 ret = inode->i_fop->open(inode, file);
103 }
104 return ret;
105}
106
107/*
108 * If the filp already has private_data, that means the file was
109 * already opened by intent during lookup, and we do nothing.
110 *
111 * If we already have the requisite capabilities, we can satisfy
112 * the open request locally (no need to request new caps from the
113 * MDS). We do, however, need to inform the MDS (asynchronously)
114 * if our wanted caps set expands.
115 */
116int ceph_open(struct inode *inode, struct file *file)
117{
118 struct ceph_inode_info *ci = ceph_inode(inode);
119 struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
120 struct ceph_mds_client *mdsc = &client->mdsc;
121 struct ceph_mds_request *req;
122 struct ceph_file_info *cf = file->private_data;
123 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
124 int err;
125 int flags, fmode, wanted;
126
127 if (cf) {
128 dout("open file %p is already opened\n", file);
129 return 0;
130 }
131
132 /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */
133 flags = file->f_flags & ~(O_CREAT|O_EXCL);
134 if (S_ISDIR(inode->i_mode))
135 flags = O_DIRECTORY; /* mds likes to know */
136
137 dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
138 ceph_vinop(inode), file, flags, file->f_flags);
139 fmode = ceph_flags_to_mode(flags);
140 wanted = ceph_caps_for_mode(fmode);
141
142 /* snapped files are read-only */
143 if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
144 return -EROFS;
145
146 /* trivially open snapdir */
147 if (ceph_snap(inode) == CEPH_SNAPDIR) {
148 spin_lock(&inode->i_lock);
149 __ceph_get_fmode(ci, fmode);
150 spin_unlock(&inode->i_lock);
151 return ceph_init_file(inode, file, fmode);
152 }
153
154 /*
155 * No need to block if we have any caps. Update wanted set
156 * asynchronously.
157 */
158 spin_lock(&inode->i_lock);
159 if (__ceph_is_any_real_caps(ci)) {
160 int mds_wanted = __ceph_caps_mds_wanted(ci);
161 int issued = __ceph_caps_issued(ci, NULL);
162
163 dout("open %p fmode %d want %s issued %s using existing\n",
164 inode, fmode, ceph_cap_string(wanted),
165 ceph_cap_string(issued));
166 __ceph_get_fmode(ci, fmode);
167 spin_unlock(&inode->i_lock);
168
169 /* adjust wanted? */
170 if ((issued & wanted) != wanted &&
171 (mds_wanted & wanted) != wanted &&
172 ceph_snap(inode) != CEPH_SNAPDIR)
173 ceph_check_caps(ci, 0, NULL);
174
175 return ceph_init_file(inode, file, fmode);
176 } else if (ceph_snap(inode) != CEPH_NOSNAP &&
177 (ci->i_snap_caps & wanted) == wanted) {
178 __ceph_get_fmode(ci, fmode);
179 spin_unlock(&inode->i_lock);
180 return ceph_init_file(inode, file, fmode);
181 }
182 spin_unlock(&inode->i_lock);
183
184 dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
185 req = prepare_open_request(inode->i_sb, flags, 0);
186 if (IS_ERR(req)) {
187 err = PTR_ERR(req);
188 goto out;
189 }
190 req->r_inode = igrab(inode);
191 req->r_num_caps = 1;
192 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
193 if (!err)
194 err = ceph_init_file(inode, file, req->r_fmode);
195 ceph_mdsc_put_request(req);
196 dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
197out:
198 return err;
199}
200
201
202/*
203 * Do a lookup + open with a single request.
204 *
205 * If this succeeds, but some subsequent check in the vfs
206 * may_open() fails, the struct *file gets cleaned up (i.e.
207 * ceph_release gets called). So fear not!
208 */
209/*
210 * flags
211 * path_lookup_open -> LOOKUP_OPEN
212 * path_lookup_create -> LOOKUP_OPEN|LOOKUP_CREATE
213 */
214struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
215 struct nameidata *nd, int mode,
216 int locked_dir)
217{
218 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
219 struct ceph_mds_client *mdsc = &client->mdsc;
220 struct file *file = nd->intent.open.file;
221 struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
222 struct ceph_mds_request *req;
223 int err;
224 int flags = nd->intent.open.flags - 1; /* silly vfs! */
225
226 dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n",
227 dentry, dentry->d_name.len, dentry->d_name.name, flags, mode);
228
229 /* do the open */
230 req = prepare_open_request(dir->i_sb, flags, mode);
231 if (IS_ERR(req))
232 return ERR_PTR(PTR_ERR(req));
233 req->r_dentry = dget(dentry);
234 req->r_num_caps = 2;
235 if (flags & O_CREAT) {
236 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
237 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
238 }
239 req->r_locked_dir = dir; /* caller holds dir->i_mutex */
240 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
241 dentry = ceph_finish_lookup(req, dentry, err);
242 if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
243 err = ceph_handle_notrace_create(dir, dentry);
244 if (!err)
245 err = ceph_init_file(req->r_dentry->d_inode, file,
246 req->r_fmode);
247 ceph_mdsc_put_request(req);
248 dout("ceph_lookup_open result=%p\n", dentry);
249 return dentry;
250}
251
252int ceph_release(struct inode *inode, struct file *file)
253{
254 struct ceph_inode_info *ci = ceph_inode(inode);
255 struct ceph_file_info *cf = file->private_data;
256
257 dout("release inode %p file %p\n", inode, file);
258 ceph_put_fmode(ci, cf->fmode);
259 if (cf->last_readdir)
260 ceph_mdsc_put_request(cf->last_readdir);
261 kfree(cf->last_name);
262 kfree(cf->dir_info);
263 dput(cf->dentry);
264 kmem_cache_free(ceph_file_cachep, cf);
265
266 /* wake up anyone waiting for caps on this inode */
267 wake_up(&ci->i_cap_wq);
268 return 0;
269}
270
271/*
272 * build a vector of user pages
273 */
274static struct page **get_direct_page_vector(const char __user *data,
275 int num_pages,
276 loff_t off, size_t len)
277{
278 struct page **pages;
279 int rc;
280
281 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
282 if (!pages)
283 return ERR_PTR(-ENOMEM);
284
285 down_read(&current->mm->mmap_sem);
286 rc = get_user_pages(current, current->mm, (unsigned long)data,
287 num_pages, 0, 0, pages, NULL);
288 up_read(&current->mm->mmap_sem);
289 if (rc < 0)
290 goto fail;
291 return pages;
292
293fail:
294 kfree(pages);
295 return ERR_PTR(rc);
296}
297
298static void put_page_vector(struct page **pages, int num_pages)
299{
300 int i;
301
302 for (i = 0; i < num_pages; i++)
303 put_page(pages[i]);
304 kfree(pages);
305}
306
307void ceph_release_page_vector(struct page **pages, int num_pages)
308{
309 int i;
310
311 for (i = 0; i < num_pages; i++)
312 __free_pages(pages[i], 0);
313 kfree(pages);
314}
315
316/*
 317 * allocate a vector of new pages
318 */
319static struct page **alloc_page_vector(int num_pages)
320{
321 struct page **pages;
322 int i;
323
324 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
325 if (!pages)
326 return ERR_PTR(-ENOMEM);
327 for (i = 0; i < num_pages; i++) {
328 pages[i] = alloc_page(GFP_NOFS);
329 if (pages[i] == NULL) {
330 ceph_release_page_vector(pages, i);
331 return ERR_PTR(-ENOMEM);
332 }
333 }
334 return pages;
335}
336
337/*
338 * copy user data into a page vector
339 */
340static int copy_user_to_page_vector(struct page **pages,
341 const char __user *data,
342 loff_t off, size_t len)
343{
344 int i = 0;
345 int po = off & ~PAGE_CACHE_MASK;
346 int left = len;
347 int l, bad;
348
349 while (left > 0) {
350 l = min_t(int, PAGE_CACHE_SIZE-po, left);
351 bad = copy_from_user(page_address(pages[i]) + po, data, l);
352 if (bad == l)
353 return -EFAULT;
354 data += l - bad;
355 left -= l - bad;
356 po += l - bad;
357 if (po == PAGE_CACHE_SIZE) {
358 po = 0;
359 i++;
360 }
361 }
362 return len;
363}
364
365/*
 366 * copy data from a page vector out to a user buffer
367 */
368static int copy_page_vector_to_user(struct page **pages, char __user *data,
369 loff_t off, size_t len)
370{
371 int i = 0;
372 int po = off & ~PAGE_CACHE_MASK;
373 int left = len;
374 int l, bad;
375
376 while (left > 0) {
377 l = min_t(int, left, PAGE_CACHE_SIZE-po);
378 bad = copy_to_user(data, page_address(pages[i]) + po, l);
379 if (bad == l)
380 return -EFAULT;
381 data += l - bad;
382 left -= l - bad;
383 if (po) {
384 po += l - bad;
385 if (po == PAGE_CACHE_SIZE)
386 po = 0;
387 }
388 i++;
389 }
390 return len;
391}
392
393/*
394 * Zero an extent within a page vector. Offset is relative to the
395 * start of the first page.
396 */
397static void zero_page_vector_range(int off, int len, struct page **pages)
398{
399 int i = off >> PAGE_CACHE_SHIFT;
400
401 off &= ~PAGE_CACHE_MASK;
402
403 dout("zero_page_vector_page %u~%u\n", off, len);
404
405 /* leading partial page? */
406 if (off) {
407 int end = min((int)PAGE_CACHE_SIZE, off + len);
408 dout("zeroing %d %p head from %d\n", i, pages[i],
409 (int)off);
410 zero_user_segment(pages[i], off, end);
411 len -= (end - off);
412 i++;
413 }
414 while (len >= PAGE_CACHE_SIZE) {
415 dout("zeroing %d %p len=%d\n", i, pages[i], len);
416 zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
417 len -= PAGE_CACHE_SIZE;
418 i++;
419 }
420 /* trailing partial page? */
421 if (len) {
422 dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
423 zero_user_segment(pages[i], 0, len);
424 }
425}
426
427
428/*
429 * Read a range of bytes striped over one or more objects. Iterate over
430 * objects we stripe over. (That's not atomic, but good enough for now.)
431 *
432 * If we get a short result from the OSD, check against i_size; we need to
433 * only return a short read to the caller if we hit EOF.
434 */
435static int striped_read(struct inode *inode,
436 u64 off, u64 len,
437 struct page **pages, int num_pages,
438 int *checkeof)
439{
440 struct ceph_client *client = ceph_inode_to_client(inode);
441 struct ceph_inode_info *ci = ceph_inode(inode);
442 u64 pos, this_len;
443 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
444 int left, pages_left;
445 int read;
446 struct page **page_pos;
447 int ret;
448 bool hit_stripe, was_short;
449
450 /*
451 * we may need to do multiple reads. not atomic, unfortunately.
452 */
453 pos = off;
454 left = len;
455 page_pos = pages;
456 pages_left = num_pages;
457 read = 0;
458
459more:
460 this_len = left;
461 ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode),
462 &ci->i_layout, pos, &this_len,
463 ci->i_truncate_seq,
464 ci->i_truncate_size,
465 page_pos, pages_left);
466 hit_stripe = this_len < left;
467 was_short = ret >= 0 && ret < this_len;
468 if (ret == -ENOENT)
469 ret = 0;
470 dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read,
471 ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
472
473 if (ret > 0) {
474 int didpages =
475 ((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT;
476
477 if (read < pos - off) {
478 dout(" zero gap %llu to %llu\n", off + read, pos);
479 zero_page_vector_range(page_off + read,
480 pos - off - read, pages);
481 }
482 pos += ret;
483 read = pos - off;
484 left -= ret;
485 page_pos += didpages;
486 pages_left -= didpages;
487
488 /* hit stripe? */
489 if (left && hit_stripe)
490 goto more;
491 }
492
493 if (was_short) {
494 /* was original extent fully inside i_size? */
495 if (pos + left <= inode->i_size) {
496 dout("zero tail\n");
497 zero_page_vector_range(page_off + read, len - read,
498 pages);
499 read = len;
500 goto out;
501 }
502
503 /* check i_size */
504 *checkeof = 1;
505 }
506
507out:
508 if (ret >= 0)
509 ret = read;
510 dout("striped_read returns %d\n", ret);
511 return ret;
512}
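/*
 * calc_pages_for(), used below, is assumed to count the pages spanned
 * by a byte extent, along the lines of
 *
 *	static inline int calc_pages_for(u64 off, u64 len)
 *	{
 *		return ((off + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
 *			(off >> PAGE_CACHE_SHIFT);
 *	}
 *
 * e.g. a 100-byte extent starting 50 bytes before a page boundary
 * spans two pages.
 */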
513
514/*
515 * Completely synchronous read and write methods. Direct from __user
516 * buffer to osd, or directly to user pages (if O_DIRECT).
517 *
518 * If the read spans object boundary, just do multiple reads.
519 */
520static ssize_t ceph_sync_read(struct file *file, char __user *data,
521 unsigned len, loff_t *poff, int *checkeof)
522{
523 struct inode *inode = file->f_dentry->d_inode;
524 struct page **pages;
525 u64 off = *poff;
526 int num_pages = calc_pages_for(off, len);
527 int ret;
528
529 dout("sync_read on file %p %llu~%u %s\n", file, off, len,
530 (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
531
532 if (file->f_flags & O_DIRECT) {
533 pages = get_direct_page_vector(data, num_pages, off, len);
534
535 /*
536 * flush any page cache pages in this range. this
537 * will make concurrent normal and O_DIRECT io slow,
538 * but it will at least behave sensibly when they are
539 * in sequence.
540 */
541 } else {
542 pages = alloc_page_vector(num_pages);
543 }
544 if (IS_ERR(pages))
545 return PTR_ERR(pages);
546
547 ret = filemap_write_and_wait(inode->i_mapping);
548 if (ret < 0)
549 goto done;
550
551 ret = striped_read(inode, off, len, pages, num_pages, checkeof);
552
553 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
554 ret = copy_page_vector_to_user(pages, data, off, ret);
555 if (ret >= 0)
556 *poff = off + ret;
557
558done:
559 if (file->f_flags & O_DIRECT)
560 put_page_vector(pages, num_pages);
561 else
562 ceph_release_page_vector(pages, num_pages);
563 dout("sync_read result %d\n", ret);
564 return ret;
565}
566
567/*
568 * Write commit callback, called if we requested both an ACK and
569 * ONDISK commit reply from the OSD.
570 */
571static void sync_write_commit(struct ceph_osd_request *req,
572 struct ceph_msg *msg)
573{
574 struct ceph_inode_info *ci = ceph_inode(req->r_inode);
575
576 dout("sync_write_commit %p tid %llu\n", req, req->r_tid);
577 spin_lock(&ci->i_unsafe_lock);
578 list_del_init(&req->r_unsafe_item);
579 spin_unlock(&ci->i_unsafe_lock);
580 ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
581}
582
583/*
584 * Synchronous write, straight from __user pointer or user pages (if
585 * O_DIRECT).
586 *
587 * If write spans object boundary, just do multiple writes. (For a
588 * correct atomic write, we should e.g. take write locks on all
589 * objects, rollback on failure, etc.)
590 */
591static ssize_t ceph_sync_write(struct file *file, const char __user *data,
592 size_t left, loff_t *offset)
593{
594 struct inode *inode = file->f_dentry->d_inode;
595 struct ceph_inode_info *ci = ceph_inode(inode);
596 struct ceph_client *client = ceph_inode_to_client(inode);
597 struct ceph_osd_request *req;
598 struct page **pages;
599 int num_pages;
600 long long unsigned pos;
601 u64 len;
602 int written = 0;
603 int flags;
604 int do_sync = 0;
605 int check_caps = 0;
606 int ret;
607 struct timespec mtime = CURRENT_TIME;
608
609 if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP)
610 return -EROFS;
611
612 dout("sync_write on file %p %lld~%u %s\n", file, *offset,
613 (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
614
615 if (file->f_flags & O_APPEND)
616 pos = i_size_read(inode);
617 else
618 pos = *offset;
619
620 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
621 if (ret < 0)
622 return ret;
623
624 ret = invalidate_inode_pages2_range(inode->i_mapping,
625 pos >> PAGE_CACHE_SHIFT,
626 (pos + left) >> PAGE_CACHE_SHIFT);
627 if (ret < 0)
628 dout("invalidate_inode_pages2_range returned %d\n", ret);
629
630 flags = CEPH_OSD_FLAG_ORDERSNAP |
631 CEPH_OSD_FLAG_ONDISK |
632 CEPH_OSD_FLAG_WRITE;
633 if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
634 flags |= CEPH_OSD_FLAG_ACK;
635 else
636 do_sync = 1;
637
638 /*
639 * we may need to do multiple writes here if we span an object
640 * boundary. this isn't atomic, unfortunately. :(
641 */
642more:
643 len = left;
644 req = ceph_osdc_new_request(&client->osdc, &ci->i_layout,
645 ceph_vino(inode), pos, &len,
646 CEPH_OSD_OP_WRITE, flags,
647 ci->i_snap_realm->cached_context,
648 do_sync,
649 ci->i_truncate_seq, ci->i_truncate_size,
650 &mtime, false, 2);
651 if (IS_ERR(req))
652 return PTR_ERR(req);
653
654 num_pages = calc_pages_for(pos, len);
655
656 if (file->f_flags & O_DIRECT) {
657 pages = get_direct_page_vector(data, num_pages, pos, len);
658 if (IS_ERR(pages)) {
659 ret = PTR_ERR(pages);
660 goto out;
661 }
662
663 /*
664 * throw out any page cache pages in this range. this
665 * may block.
666 */
667 truncate_inode_pages_range(inode->i_mapping, pos, pos+len);
668 } else {
669 pages = alloc_page_vector(num_pages);
670 if (IS_ERR(pages)) {
671 ret = PTR_ERR(pages);
672 goto out;
673 }
674 ret = copy_user_to_page_vector(pages, data, pos, len);
675 if (ret < 0) {
676 ceph_release_page_vector(pages, num_pages);
677 goto out;
678 }
679
680 if ((file->f_flags & O_SYNC) == 0) {
681 /* get a second commit callback */
682 req->r_safe_callback = sync_write_commit;
683 req->r_own_pages = 1;
684 }
685 }
686 req->r_pages = pages;
687 req->r_num_pages = num_pages;
688 req->r_inode = inode;
689
690 ret = ceph_osdc_start_request(&client->osdc, req, false);
691 if (!ret) {
692 if (req->r_safe_callback) {
693 /*
694 * Add to inode unsafe list only after we
695 * start_request so that a tid has been assigned.
696 */
697 spin_lock(&ci->i_unsafe_lock);
698 list_add(&ci->i_unsafe_writes, &req->r_unsafe_item);
699 spin_unlock(&ci->i_unsafe_lock);
700 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
701 }
702 ret = ceph_osdc_wait_request(&client->osdc, req);
703 }
704
705 if (file->f_flags & O_DIRECT)
706 put_page_vector(pages, num_pages);
707 else if (file->f_flags & O_SYNC)
708 ceph_release_page_vector(pages, num_pages);
709
710out:
711 ceph_osdc_put_request(req);
712 if (ret == 0) {
713 pos += len;
714 written += len;
715 left -= len;
716 if (left)
717 goto more;
718
719 ret = written;
720 *offset = pos;
721 if (pos > i_size_read(inode))
722 check_caps = ceph_inode_set_size(inode, pos);
723 if (check_caps)
724 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
725 NULL);
726 }
727 return ret;
728}
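
The `more:` loop above exists because a single write may span a striped-object boundary: ceph_osdc_new_request() may shrink `len` to fit one object, and the loop resubmits the remainder. The page-vector size comes from calc_pages_for(); a rough standalone model of that arithmetic follows (the helper lives elsewhere in the Ceph tree, so the page-size constants here are illustrative assumptions):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12			/* assume 4 KiB pages */
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* pages touched by the byte range [pos, pos+len) */
static int calc_pages_for_model(uint64_t pos, uint64_t len)
{
	return (int)(((pos + len + PAGE_SIZE - 1) >> PAGE_SHIFT) -
		     (pos >> PAGE_SHIFT));
}

int main(void)
{
	/* a 100-byte write straddling a page boundary needs 2 pages */
	printf("%d\n", calc_pages_for_model(4090, 100));	/* 2 */
	printf("%d\n", calc_pages_for_model(0, 4096));		/* 1 */
	return 0;
}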
729
730/*
731 * Wrap generic_file_aio_read with checks for cap bits on the inode.
732 * Atomically grab references, so that those bits are not released
733 * back to the MDS mid-read.
734 *
735 * Hmm, the sync read case isn't actually async... should it be?
736 */
737static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
738 unsigned long nr_segs, loff_t pos)
739{
740 struct file *filp = iocb->ki_filp;
741 loff_t *ppos = &iocb->ki_pos;
742 size_t len = iov->iov_len;
743 struct inode *inode = filp->f_dentry->d_inode;
744 struct ceph_inode_info *ci = ceph_inode(inode);
745 void *base = iov->iov_base;
746 ssize_t ret;
747 int got = 0;
748 int checkeof = 0, read = 0;
749
750 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
751 inode, ceph_vinop(inode), pos, (unsigned)len, inode);
752again:
753 __ceph_do_pending_vmtruncate(inode);
754 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE,
755 &got, -1);
756 if (ret < 0)
757 goto out;
758 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
759 inode, ceph_vinop(inode), pos, (unsigned)len,
760 ceph_cap_string(got));
761
762 if ((got & CEPH_CAP_FILE_CACHE) == 0 ||
763 (iocb->ki_filp->f_flags & O_DIRECT) ||
764 (inode->i_sb->s_flags & MS_SYNCHRONOUS))
765 /* hmm, this isn't really async... */
766 ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
767 else
768 ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
769
770out:
771 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
772 inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
773 ceph_put_cap_refs(ci, got);
774
775 if (checkeof && ret >= 0) {
776 int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
777
778 /* hit EOF or hole? */
779 if (statret == 0 && *ppos < inode->i_size) {
780 dout("aio_read sync_read hit hole, reading more\n");
781 read += ret;
782 base += ret;
783 len -= ret;
784 checkeof = 0;
785 goto again;
786 }
787 }
788 if (ret >= 0)
789 ret += read;
790
791 return ret;
792}
793
794/*
795 * Take cap references to avoid releasing caps to MDS mid-write.
796 *
797 * If we are synchronous, and write with an old snap context, the OSD
798 * may return EOLDSNAPC. In that case, retry the write... _after_
799 * dropping our cap refs and allowing the pending snap to logically
800 * complete _before_ this write occurs.
801 *
802 * If we are near ENOSPC, write synchronously.
803 */
804static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
805 unsigned long nr_segs, loff_t pos)
806{
807 struct file *file = iocb->ki_filp;
808 struct inode *inode = file->f_dentry->d_inode;
809 struct ceph_inode_info *ci = ceph_inode(inode);
810 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
811 loff_t endoff = pos + iov->iov_len;
812 int got = 0;
813 int ret, err;
814
815 if (ceph_snap(inode) != CEPH_NOSNAP)
816 return -EROFS;
817
818retry_snap:
819 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
820 return -ENOSPC;
821 __ceph_do_pending_vmtruncate(inode);
822 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
823 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
824 inode->i_size);
825 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
826 &got, endoff);
827 if (ret < 0)
828 goto out;
829
830 dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
831 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
832 ceph_cap_string(got));
833
834 if ((got & CEPH_CAP_FILE_BUFFER) == 0 ||
835 (iocb->ki_filp->f_flags & O_DIRECT) ||
836 (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
837 ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
838 &iocb->ki_pos);
839 } else {
840 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
841
842 if ((ret >= 0 || ret == -EIOCBQUEUED) &&
843 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
844 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
845 err = vfs_fsync_range(file, file->f_path.dentry,
846 pos, pos + ret - 1, 1);
847 if (err < 0)
848 ret = err;
849 }
850 }
851 if (ret >= 0) {
852 spin_lock(&inode->i_lock);
853 __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
854 spin_unlock(&inode->i_lock);
855 }
856
857out:
858 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
859 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
860 ceph_cap_string(got));
861 ceph_put_cap_refs(ci, got);
862
863 if (ret == -EOLDSNAPC) {
864 dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
865 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
866 goto retry_snap;
867 }
868
869 return ret;
870}
871
872/*
873 * llseek. be sure to verify file size on SEEK_END.
874 */
875static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
876{
877 struct inode *inode = file->f_mapping->host;
878 int ret;
879
880 mutex_lock(&inode->i_mutex);
881 __ceph_do_pending_vmtruncate(inode);
882 switch (origin) {
883 case SEEK_END:
884 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
885 if (ret < 0) {
886 offset = ret;
887 goto out;
888 }
889 offset += inode->i_size;
890 break;
891 case SEEK_CUR:
892 /*
893 * Here we special-case the lseek(fd, 0, SEEK_CUR)
894 * position-querying operation. Avoid rewriting the "same"
895 * f_pos value back to the file because a concurrent read(),
896 * write() or lseek() might have altered it
897 */
898 if (offset == 0) {
899 offset = file->f_pos;
900 goto out;
901 }
902 offset += file->f_pos;
903 break;
904 }
905
906 if (offset < 0 || offset > inode->i_sb->s_maxbytes) {
907 offset = -EINVAL;
908 goto out;
909 }
910
911 /* Special lock needed here? */
912 if (offset != file->f_pos) {
913 file->f_pos = offset;
914 file->f_version = 0;
915 }
916
917out:
918 mutex_unlock(&inode->i_mutex);
919 return offset;
920}
921
922const struct file_operations ceph_file_fops = {
923 .open = ceph_open,
924 .release = ceph_release,
925 .llseek = ceph_llseek,
926 .read = do_sync_read,
927 .write = do_sync_write,
928 .aio_read = ceph_aio_read,
929 .aio_write = ceph_aio_write,
930 .mmap = ceph_mmap,
931 .fsync = ceph_fsync,
932 .splice_read = generic_file_splice_read,
933 .splice_write = generic_file_splice_write,
934 .unlocked_ioctl = ceph_ioctl,
935 .compat_ioctl = ceph_ioctl,
936};
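
Note that .read and .write point at the VFS helpers do_sync_read/do_sync_write, so plain read(2)/write(2) calls are funneled into the aio entry points above. A simplified sketch of how that works (a paraphrase of the generic VFS helper of this era, not part of this patch, and not compilable standalone):

/* roughly what do_sync_read does: wrap the buffer in an iovec,
 * build a synchronous kiocb, call ->aio_read, and wait if the
 * operation queues */
static ssize_t do_sync_read_sketch(struct file *filp, char __user *buf,
				   size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct kiocb kiocb;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = *ppos;
	ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
	if (ret == -EIOCBQUEUED)
		ret = wait_on_sync_kiocb(&kiocb);
	*ppos = kiocb.ki_pos;
	return ret;
}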
937
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
new file mode 100644
index 000000000000..7abe1aed819b
--- /dev/null
+++ b/fs/ceph/inode.c
@@ -0,0 +1,1750 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/fs.h>
5#include <linux/smp_lock.h>
6#include <linux/slab.h>
7#include <linux/string.h>
8#include <linux/uaccess.h>
9#include <linux/kernel.h>
10#include <linux/namei.h>
11#include <linux/writeback.h>
12#include <linux/vmalloc.h>
13#include <linux/pagevec.h>
14
15#include "super.h"
16#include "decode.h"
17
18/*
19 * Ceph inode operations
20 *
21 * Implement basic inode helpers (get, alloc) and inode ops (getattr,
22 * setattr, etc.), xattr helpers, and helpers for assimilating
23 * metadata returned by the MDS into our cache.
24 *
25 * Also define helpers for doing asynchronous writeback, invalidation,
26 * and truncation for the benefit of those who can't afford to block
27 * (typically because they are in the message handler path).
28 */
29
30static const struct inode_operations ceph_symlink_iops;
31
32static void ceph_invalidate_work(struct work_struct *work);
33static void ceph_writeback_work(struct work_struct *work);
34static void ceph_vmtruncate_work(struct work_struct *work);
35
36/*
37 * find or create an inode, given the ceph ino number
38 */
39struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
40{
41 struct inode *inode;
42 ino_t t = ceph_vino_to_ino(vino);
43
44 inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
45 if (inode == NULL)
46 return ERR_PTR(-ENOMEM);
47 if (inode->i_state & I_NEW) {
48 dout("get_inode created new inode %p %llx.%llx ino %llx\n",
49 inode, ceph_vinop(inode), (u64)inode->i_ino);
50 unlock_new_inode(inode);
51 }
52
53 dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
54 vino.snap, inode);
55 return inode;
56}
57
58/*
59 * get/construct snapdir inode for a given directory
60 */
61struct inode *ceph_get_snapdir(struct inode *parent)
62{
63 struct ceph_vino vino = {
64 .ino = ceph_ino(parent),
65 .snap = CEPH_SNAPDIR,
66 };
67 struct inode *inode = ceph_get_inode(parent->i_sb, vino);
68 struct ceph_inode_info *ci = ceph_inode(inode);
69
70 BUG_ON(!S_ISDIR(parent->i_mode));
71 if (IS_ERR(inode))
72 return ERR_CAST(inode);
73 inode->i_mode = parent->i_mode;
74 inode->i_uid = parent->i_uid;
75 inode->i_gid = parent->i_gid;
76 inode->i_op = &ceph_dir_iops;
77 inode->i_fop = &ceph_dir_fops;
78 ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
79 ci->i_rbytes = 0;
80 return inode;
81}
82
83const struct inode_operations ceph_file_iops = {
84 .permission = ceph_permission,
85 .setattr = ceph_setattr,
86 .getattr = ceph_getattr,
87 .setxattr = ceph_setxattr,
88 .getxattr = ceph_getxattr,
89 .listxattr = ceph_listxattr,
90 .removexattr = ceph_removexattr,
91};
92
93
94/*
95 * We use a 'frag tree' to keep track of the MDS's directory fragments
96 * for a given inode (usually there is just a single fragment). We
97 * need to know when a child frag is delegated to a new MDS, or when
98 * it is flagged as replicated, so we can direct our requests
99 * accordingly.
100 */
101
102/*
103 * find/create a frag in the tree
104 */
105static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
106 u32 f)
107{
108 struct rb_node **p;
109 struct rb_node *parent = NULL;
110 struct ceph_inode_frag *frag;
111 int c;
112
113 p = &ci->i_fragtree.rb_node;
114 while (*p) {
115 parent = *p;
116 frag = rb_entry(parent, struct ceph_inode_frag, node);
117 c = ceph_frag_compare(f, frag->frag);
118 if (c < 0)
119 p = &(*p)->rb_left;
120 else if (c > 0)
121 p = &(*p)->rb_right;
122 else
123 return frag;
124 }
125
126 frag = kmalloc(sizeof(*frag), GFP_NOFS);
127 if (!frag) {
128 pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx "
129 "frag %x\n", &ci->vfs_inode,
130 ceph_vinop(&ci->vfs_inode), f);
131 return ERR_PTR(-ENOMEM);
132 }
133 frag->frag = f;
134 frag->split_by = 0;
135 frag->mds = -1;
136 frag->ndist = 0;
137
138 rb_link_node(&frag->node, parent, p);
139 rb_insert_color(&frag->node, &ci->i_fragtree);
140
141 dout("get_or_create_frag added %llx.%llx frag %x\n",
142 ceph_vinop(&ci->vfs_inode), f);
143 return frag;
144}
145
146/*
147 * find a specific frag @f
148 */
149struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
150{
151 struct rb_node *n = ci->i_fragtree.rb_node;
152
153 while (n) {
154 struct ceph_inode_frag *frag =
155 rb_entry(n, struct ceph_inode_frag, node);
156 int c = ceph_frag_compare(f, frag->frag);
157 if (c < 0)
158 n = n->rb_left;
159 else if (c > 0)
160 n = n->rb_right;
161 else
162 return frag;
163 }
164 return NULL;
165}
166
167/*
168 * Choose frag containing the given value @v. If @pfrag is
169 * specified, copy the frag delegation info to the caller if
170 * it is present.
171 */
172u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
173 struct ceph_inode_frag *pfrag,
174 int *found)
175{
176 u32 t = ceph_frag_make(0, 0);
177 struct ceph_inode_frag *frag;
178 unsigned nway, i;
179 u32 n;
180
181 if (found)
182 *found = 0;
183
184 mutex_lock(&ci->i_fragtree_mutex);
185 while (1) {
186 WARN_ON(!ceph_frag_contains_value(t, v));
187 frag = __ceph_find_frag(ci, t);
188 if (!frag)
189 break; /* t is a leaf */
190 if (frag->split_by == 0) {
191 if (pfrag)
192 memcpy(pfrag, frag, sizeof(*pfrag));
193 if (found)
194 *found = 1;
195 break;
196 }
197
198 /* choose child */
199 nway = 1 << frag->split_by;
200 dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
201 frag->split_by, nway);
202 for (i = 0; i < nway; i++) {
203 n = ceph_frag_make_child(t, frag->split_by, i);
204 if (ceph_frag_contains_value(n, v)) {
205 t = n;
206 break;
207 }
208 }
209 BUG_ON(i == nway);
210 }
211 dout("choose_frag(%x) = %x\n", v, t);
212
213 mutex_unlock(&ci->i_fragtree_mutex);
214 return t;
215}
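
The walk above repeatedly narrows `t` to the child fragment containing the hash value `v`. It relies on the frag encoding used by Ceph's frag helpers, which (as assumed here) pack an 8-bit prefix length into the top byte and a 24-bit value prefix below it; the `frag_*` names below are local stand-ins for ceph_frag_make(), ceph_frag_contains_value(), and ceph_frag_make_child():

#include <stdio.h>
#include <stdint.h>

/* model: top 8 bits = number of significant hash bits,
 * low 24 bits = the value prefix for this fragment */
static uint32_t frag_make(uint32_t bits, uint32_t v)
{
	uint32_t mask = (0xffffffu << (24 - bits)) & 0xffffffu;
	return (bits << 24) | (v & mask);
}
static uint32_t frag_bits(uint32_t f)  { return f >> 24; }
static uint32_t frag_value(uint32_t f) { return f & 0xffffffu; }

static int frag_contains_value(uint32_t f, uint32_t v)
{
	uint32_t mask = (0xffffffu << (24 - frag_bits(f))) & 0xffffffu;
	return (v & mask) == frag_value(f);
}

static uint32_t frag_make_child(uint32_t f, int by, int i)
{
	uint32_t newbits = frag_bits(f) + by;
	return frag_make(newbits,
			 frag_value(f) | ((uint32_t)i << (24 - newbits)));
}

int main(void)
{
	uint32_t root = frag_make(0, 0);	   /* whole hash space */
	uint32_t hi = frag_make_child(root, 1, 1); /* upper half */

	printf("%d\n", frag_contains_value(hi, 0x800000));	/* 1 */
	printf("%d\n", frag_contains_value(hi, 0x000001));	/* 0 */
	return 0;
}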
216
217/*
218 * Process dirfrag (delegation) info from the mds. Include leaf
219 * fragment in tree ONLY if ndist > 0. Otherwise, only
220 * branches/splits are included in i_fragtree.
221 */
222static int ceph_fill_dirfrag(struct inode *inode,
223 struct ceph_mds_reply_dirfrag *dirinfo)
224{
225 struct ceph_inode_info *ci = ceph_inode(inode);
226 struct ceph_inode_frag *frag;
227 u32 id = le32_to_cpu(dirinfo->frag);
228 int mds = le32_to_cpu(dirinfo->auth);
229 int ndist = le32_to_cpu(dirinfo->ndist);
230 int i;
231 int err = 0;
232
233 mutex_lock(&ci->i_fragtree_mutex);
234 if (ndist == 0) {
235 /* no delegation info needed. */
236 frag = __ceph_find_frag(ci, id);
237 if (!frag)
238 goto out;
239 if (frag->split_by == 0) {
240 /* tree leaf, remove */
241 dout("fill_dirfrag removed %llx.%llx frag %x"
242 " (no ref)\n", ceph_vinop(inode), id);
243 rb_erase(&frag->node, &ci->i_fragtree);
244 kfree(frag);
245 } else {
246 /* tree branch, keep and clear */
247 dout("fill_dirfrag cleared %llx.%llx frag %x"
248 " referral\n", ceph_vinop(inode), id);
249 frag->mds = -1;
250 frag->ndist = 0;
251 }
252 goto out;
253 }
254
255
256 /* find/add this frag to store mds delegation info */
257 frag = __get_or_create_frag(ci, id);
258 if (IS_ERR(frag)) {
259 /* this is not the end of the world; we can continue
260 with bad/inaccurate delegation info */
261 pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
262 ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
263 err = -ENOMEM;
264 goto out;
265 }
266
267 frag->mds = mds;
268 frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
269 for (i = 0; i < frag->ndist; i++)
270 frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
271 dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
272 ceph_vinop(inode), frag->frag, frag->ndist);
273
274out:
275 mutex_unlock(&ci->i_fragtree_mutex);
276 return err;
277}
278
279
280/*
281 * initialize a newly allocated inode.
282 */
283struct inode *ceph_alloc_inode(struct super_block *sb)
284{
285 struct ceph_inode_info *ci;
286 int i;
287
288 ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
289 if (!ci)
290 return NULL;
291
292 dout("alloc_inode %p\n", &ci->vfs_inode);
293
294 ci->i_version = 0;
295 ci->i_time_warp_seq = 0;
296 ci->i_ceph_flags = 0;
297 ci->i_release_count = 0;
298 ci->i_symlink = NULL;
299
300 ci->i_fragtree = RB_ROOT;
301 mutex_init(&ci->i_fragtree_mutex);
302
303 ci->i_xattrs.blob = NULL;
304 ci->i_xattrs.prealloc_blob = NULL;
305 ci->i_xattrs.dirty = false;
306 ci->i_xattrs.index = RB_ROOT;
307 ci->i_xattrs.count = 0;
308 ci->i_xattrs.names_size = 0;
309 ci->i_xattrs.vals_size = 0;
310 ci->i_xattrs.version = 0;
311 ci->i_xattrs.index_version = 0;
312
313 ci->i_caps = RB_ROOT;
314 ci->i_auth_cap = NULL;
315 ci->i_dirty_caps = 0;
316 ci->i_flushing_caps = 0;
317 INIT_LIST_HEAD(&ci->i_dirty_item);
318 INIT_LIST_HEAD(&ci->i_flushing_item);
319 ci->i_cap_flush_seq = 0;
320 ci->i_cap_flush_last_tid = 0;
321 memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
322 init_waitqueue_head(&ci->i_cap_wq);
323 ci->i_hold_caps_min = 0;
324 ci->i_hold_caps_max = 0;
325 INIT_LIST_HEAD(&ci->i_cap_delay_list);
326 ci->i_cap_exporting_mds = 0;
327 ci->i_cap_exporting_mseq = 0;
328 ci->i_cap_exporting_issued = 0;
329 INIT_LIST_HEAD(&ci->i_cap_snaps);
330 ci->i_head_snapc = NULL;
331 ci->i_snap_caps = 0;
332
333 for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
334 ci->i_nr_by_mode[i] = 0;
335
336 ci->i_truncate_seq = 0;
337 ci->i_truncate_size = 0;
338 ci->i_truncate_pending = 0;
339
340 ci->i_max_size = 0;
341 ci->i_reported_size = 0;
342 ci->i_wanted_max_size = 0;
343 ci->i_requested_max_size = 0;
344
345 ci->i_pin_ref = 0;
346 ci->i_rd_ref = 0;
347 ci->i_rdcache_ref = 0;
348 ci->i_wr_ref = 0;
349 ci->i_wrbuffer_ref = 0;
350 ci->i_wrbuffer_ref_head = 0;
351 ci->i_shared_gen = 0;
352 ci->i_rdcache_gen = 0;
353 ci->i_rdcache_revoking = 0;
354
355 INIT_LIST_HEAD(&ci->i_unsafe_writes);
356 INIT_LIST_HEAD(&ci->i_unsafe_dirops);
357 spin_lock_init(&ci->i_unsafe_lock);
358
359 ci->i_snap_realm = NULL;
360 INIT_LIST_HEAD(&ci->i_snap_realm_item);
361 INIT_LIST_HEAD(&ci->i_snap_flush_item);
362
363 INIT_WORK(&ci->i_wb_work, ceph_writeback_work);
364 INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work);
365
366 INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
367
368 return &ci->vfs_inode;
369}
370
371void ceph_destroy_inode(struct inode *inode)
372{
373 struct ceph_inode_info *ci = ceph_inode(inode);
374 struct ceph_inode_frag *frag;
375 struct rb_node *n;
376
377 dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
378
379 ceph_queue_caps_release(inode);
380
381 kfree(ci->i_symlink);
382 while ((n = rb_first(&ci->i_fragtree)) != NULL) {
383 frag = rb_entry(n, struct ceph_inode_frag, node);
384 rb_erase(n, &ci->i_fragtree);
385 kfree(frag);
386 }
387
388 __ceph_destroy_xattrs(ci);
389 if (ci->i_xattrs.blob)
390 ceph_buffer_put(ci->i_xattrs.blob);
391 if (ci->i_xattrs.prealloc_blob)
392 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
393
394 kmem_cache_free(ceph_inode_cachep, ci);
395}
396
397
398/*
399 * Helpers to fill in size, ctime, mtime, and atime. We have to be
400 * careful because either the client or MDS may have more up to date
401 * info, depending on which capabilities are held, and whether
402 * time_warp_seq or truncate_seq have increased. (Ordinarily, mtime
403 * and size are monotonically increasing, except when utimes() or
404 * truncate() increments the corresponding _seq values.)
405 */
406int ceph_fill_file_size(struct inode *inode, int issued,
407 u32 truncate_seq, u64 truncate_size, u64 size)
408{
409 struct ceph_inode_info *ci = ceph_inode(inode);
410 int queue_trunc = 0;
411
412 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
413 (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
414 dout("size %lld -> %llu\n", inode->i_size, size);
415 inode->i_size = size;
416 inode->i_blocks = (size + (1<<9) - 1) >> 9;
417 ci->i_reported_size = size;
418 if (truncate_seq != ci->i_truncate_seq) {
419 dout("truncate_seq %u -> %u\n",
420 ci->i_truncate_seq, truncate_seq);
421 ci->i_truncate_seq = truncate_seq;
422 /*
423 * If we hold relevant caps, or in the case where we're
424 * not the only client referencing this file and we
425 * don't hold those caps, then we need to check whether
426 * the file is either opened or mmaped
427 */
428 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
429 CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
430 CEPH_CAP_FILE_EXCL)) ||
431 mapping_mapped(inode->i_mapping) ||
432 __ceph_caps_file_wanted(ci)) {
433 ci->i_truncate_pending++;
434 queue_trunc = 1;
435 }
436 }
437 }
438 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
439 ci->i_truncate_size != truncate_size) {
440 dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
441 truncate_size);
442 ci->i_truncate_size = truncate_size;
443 }
444 return queue_trunc;
445}
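
Both branches above compare truncate_seq values through ceph_seq_cmp() rather than a plain `>`; presumably (the helper is defined elsewhere in the tree) this is the usual wraparound-safe sequence comparison, so ordering survives a u32 overflow. A minimal model of that idea:

#include <stdio.h>
#include <stdint.h>

/* positive iff a is "after" b, even across a u32 wraparound */
static int seq_cmp_model(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b);
}

int main(void)
{
	printf("%d\n", seq_cmp_model(2, 1) > 0);	   /* 1 */
	printf("%d\n", seq_cmp_model(1, 0xfffffff0u) > 0); /* 1: wrapped */
	printf("%d\n", seq_cmp_model(1, 2) > 0);	   /* 0 */
	return 0;
}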
446
447void ceph_fill_file_time(struct inode *inode, int issued,
448 u64 time_warp_seq, struct timespec *ctime,
449 struct timespec *mtime, struct timespec *atime)
450{
451 struct ceph_inode_info *ci = ceph_inode(inode);
452 int warn = 0;
453
454 if (issued & (CEPH_CAP_FILE_EXCL|
455 CEPH_CAP_FILE_WR|
456 CEPH_CAP_FILE_BUFFER)) {
457 if (timespec_compare(ctime, &inode->i_ctime) > 0) {
458 dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
459 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
460 ctime->tv_sec, ctime->tv_nsec);
461 inode->i_ctime = *ctime;
462 }
463 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
464 /* the MDS did a utimes() */
465 dout("mtime %ld.%09ld -> %ld.%09ld "
466 "tw %d -> %d\n",
467 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
468 mtime->tv_sec, mtime->tv_nsec,
469 ci->i_time_warp_seq, (int)time_warp_seq);
470
471 inode->i_mtime = *mtime;
472 inode->i_atime = *atime;
473 ci->i_time_warp_seq = time_warp_seq;
474 } else if (time_warp_seq == ci->i_time_warp_seq) {
475 /* nobody did utimes(); take the max */
476 if (timespec_compare(mtime, &inode->i_mtime) > 0) {
477 dout("mtime %ld.%09ld -> %ld.%09ld inc\n",
478 inode->i_mtime.tv_sec,
479 inode->i_mtime.tv_nsec,
480 mtime->tv_sec, mtime->tv_nsec);
481 inode->i_mtime = *mtime;
482 }
483 if (timespec_compare(atime, &inode->i_atime) > 0) {
484 dout("atime %ld.%09ld -> %ld.%09ld inc\n",
485 inode->i_atime.tv_sec,
486 inode->i_atime.tv_nsec,
487 atime->tv_sec, atime->tv_nsec);
488 inode->i_atime = *atime;
489 }
490 } else if (issued & CEPH_CAP_FILE_EXCL) {
491 /* we did a utimes(); ignore mds values */
492 } else {
493 warn = 1;
494 }
495 } else {
496 /* we have no write caps; whatever the MDS says is true */
497 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
498 inode->i_ctime = *ctime;
499 inode->i_mtime = *mtime;
500 inode->i_atime = *atime;
501 ci->i_time_warp_seq = time_warp_seq;
502 } else {
503 warn = 1;
504 }
505 }
506 if (warn) /* time_warp_seq shouldn't go backwards */
507 dout("%p mds time_warp_seq %llu < %u\n",
508 inode, time_warp_seq, ci->i_time_warp_seq);
509}
510
511/*
512 * Populate an inode based on info from mds. May be called on new or
513 * existing inodes.
514 */
515static int fill_inode(struct inode *inode,
516 struct ceph_mds_reply_info_in *iinfo,
517 struct ceph_mds_reply_dirfrag *dirinfo,
518 struct ceph_mds_session *session,
519 unsigned long ttl_from, int cap_fmode,
520 struct ceph_cap_reservation *caps_reservation)
521{
522 struct ceph_mds_reply_inode *info = iinfo->in;
523 struct ceph_inode_info *ci = ceph_inode(inode);
524 int i;
525 int issued, implemented;
526 struct timespec mtime, atime, ctime;
527 u32 nsplits;
528 struct ceph_buffer *xattr_blob = NULL;
529 int err = 0;
530 int queue_trunc = 0;
531
532 dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
533 inode, ceph_vinop(inode), le64_to_cpu(info->version),
534 ci->i_version);
535
536 /*
537 * prealloc xattr data, if it looks like we'll need it. only
538 * if len > 4 (meaning there are actually xattrs; the first 4
539 * bytes are the xattr count).
540 */
541 if (iinfo->xattr_len > 4) {
542 xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
543 if (!xattr_blob)
544 pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
545 iinfo->xattr_len);
546 }
547
548 spin_lock(&inode->i_lock);
549
550 /*
551 * provided version will be odd if inode value is projected,
552 * even if stable. skip the update if we have newer info
553 * (e.g., due to inode info racing from multiple MDSs), or if
554 * we are getting projected (unstable) inode info.
555 */
556 if (le64_to_cpu(info->version) > 0 &&
557 (ci->i_version & ~1) > le64_to_cpu(info->version))
558 goto no_change;
559
560 issued = __ceph_caps_issued(ci, &implemented);
561 issued |= implemented | __ceph_caps_dirty(ci);
562
563 /* update inode */
564 ci->i_version = le64_to_cpu(info->version);
565 inode->i_version++;
566 inode->i_rdev = le32_to_cpu(info->rdev);
567
568 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
569 inode->i_mode = le32_to_cpu(info->mode);
570 inode->i_uid = le32_to_cpu(info->uid);
571 inode->i_gid = le32_to_cpu(info->gid);
572 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
573 inode->i_uid, inode->i_gid);
574 }
575
576 if ((issued & CEPH_CAP_LINK_EXCL) == 0)
577 inode->i_nlink = le32_to_cpu(info->nlink);
578
579 /* be careful with mtime, atime, size */
580 ceph_decode_timespec(&atime, &info->atime);
581 ceph_decode_timespec(&mtime, &info->mtime);
582 ceph_decode_timespec(&ctime, &info->ctime);
583 queue_trunc = ceph_fill_file_size(inode, issued,
584 le32_to_cpu(info->truncate_seq),
585 le64_to_cpu(info->truncate_size),
586 le64_to_cpu(info->size));
587 ceph_fill_file_time(inode, issued,
588 le32_to_cpu(info->time_warp_seq),
589 &ctime, &mtime, &atime);
590
591 ci->i_max_size = le64_to_cpu(info->max_size);
592 ci->i_layout = info->layout;
593 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
594
595 /* xattrs */
596 /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
597 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
598 le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
599 if (ci->i_xattrs.blob)
600 ceph_buffer_put(ci->i_xattrs.blob);
601 ci->i_xattrs.blob = xattr_blob;
602 if (xattr_blob)
603 memcpy(ci->i_xattrs.blob->vec.iov_base,
604 iinfo->xattr_data, iinfo->xattr_len);
605 ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
606 }
607
608 inode->i_mapping->a_ops = &ceph_aops;
609 inode->i_mapping->backing_dev_info =
610 &ceph_client(inode->i_sb)->backing_dev_info;
611
612 switch (inode->i_mode & S_IFMT) {
613 case S_IFIFO:
614 case S_IFBLK:
615 case S_IFCHR:
616 case S_IFSOCK:
617 init_special_inode(inode, inode->i_mode, inode->i_rdev);
618 inode->i_op = &ceph_file_iops;
619 break;
620 case S_IFREG:
621 inode->i_op = &ceph_file_iops;
622 inode->i_fop = &ceph_file_fops;
623 break;
624 case S_IFLNK:
625 inode->i_op = &ceph_symlink_iops;
626 if (!ci->i_symlink) {
627 int symlen = iinfo->symlink_len;
628 char *sym;
629
630 BUG_ON(symlen != inode->i_size);
631 spin_unlock(&inode->i_lock);
632
633 err = -ENOMEM;
634 sym = kmalloc(symlen+1, GFP_NOFS);
635 if (!sym)
636 goto out;
637 memcpy(sym, iinfo->symlink, symlen);
638 sym[symlen] = 0;
639
640 spin_lock(&inode->i_lock);
641 if (!ci->i_symlink)
642 ci->i_symlink = sym;
643 else
644 kfree(sym); /* lost a race */
645 }
646 break;
647 case S_IFDIR:
648 inode->i_op = &ceph_dir_iops;
649 inode->i_fop = &ceph_dir_fops;
650
651 ci->i_files = le64_to_cpu(info->files);
652 ci->i_subdirs = le64_to_cpu(info->subdirs);
653 ci->i_rbytes = le64_to_cpu(info->rbytes);
654 ci->i_rfiles = le64_to_cpu(info->rfiles);
655 ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
656 ceph_decode_timespec(&ci->i_rctime, &info->rctime);
657
658 /* set dir completion flag? */
659 if (ci->i_files == 0 && ci->i_subdirs == 0 &&
660 ceph_snap(inode) == CEPH_NOSNAP &&
661 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) {
662 dout(" marking %p complete (empty)\n", inode);
663 ci->i_ceph_flags |= CEPH_I_COMPLETE;
664 ci->i_max_offset = 2;
665 }
666
667 /* would it be better to set st_size in getattr instead? */
668 if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES))
669 inode->i_size = ci->i_rbytes;
670 break;
671 default:
672 pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
673 ceph_vinop(inode), inode->i_mode);
674 }
675
676no_change:
677 spin_unlock(&inode->i_lock);
678
679 /* queue truncate if we saw i_size decrease */
680 if (queue_trunc)
681 ceph_queue_vmtruncate(inode);
682
683 /* populate frag tree */
684 /* FIXME: move me up, if/when version reflects fragtree changes */
685 nsplits = le32_to_cpu(info->fragtree.nsplits);
686 mutex_lock(&ci->i_fragtree_mutex);
687 for (i = 0; i < nsplits; i++) {
688 u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
689 struct ceph_inode_frag *frag = __get_or_create_frag(ci, id);
690
691 if (IS_ERR(frag))
692 continue;
693 frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
694 dout(" frag %x split by %d\n", frag->frag, frag->split_by);
695 }
696 mutex_unlock(&ci->i_fragtree_mutex);
697
698 /* were we issued a capability? */
699 if (info->cap.caps) {
700 if (ceph_snap(inode) == CEPH_NOSNAP) {
701 ceph_add_cap(inode, session,
702 le64_to_cpu(info->cap.cap_id),
703 cap_fmode,
704 le32_to_cpu(info->cap.caps),
705 le32_to_cpu(info->cap.wanted),
706 le32_to_cpu(info->cap.seq),
707 le32_to_cpu(info->cap.mseq),
708 le64_to_cpu(info->cap.realm),
709 info->cap.flags,
710 caps_reservation);
711 } else {
712 spin_lock(&inode->i_lock);
713 dout(" %p got snap_caps %s\n", inode,
714 ceph_cap_string(le32_to_cpu(info->cap.caps)));
715 ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
716 if (cap_fmode >= 0)
717 __ceph_get_fmode(ci, cap_fmode);
718 spin_unlock(&inode->i_lock);
719 }
720 }
721
722 /* update delegation info? */
723 if (dirinfo)
724 ceph_fill_dirfrag(inode, dirinfo);
725
726 err = 0;
727
728out:
729 if (xattr_blob)
730 ceph_buffer_put(xattr_blob);
731 return err;
732}
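
The version check near the top of fill_inode() encodes "projected" state in the low bit: stable versions are even, in-flight (projected) ones odd, and clearing the bit with `& ~1` lets the stable portion be compared against the incoming version. A small standalone illustration of that skip test:

#include <stdio.h>
#include <stdint.h>

/* skip the update if our cached stable version already exceeds
 * the version the MDS just sent */
static int skip_update(uint64_t cached, uint64_t incoming)
{
	return incoming > 0 && (cached & ~1ULL) > incoming;
}

int main(void)
{
	printf("%d\n", skip_update(8, 6)); /* stable v8 beats v6: skip */
	printf("%d\n", skip_update(7, 8)); /* projected v7, v8 newer: apply */
	return 0;
}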
733
734/*
735 * caller should hold session s_mutex.
736 */
737static void update_dentry_lease(struct dentry *dentry,
738 struct ceph_mds_reply_lease *lease,
739 struct ceph_mds_session *session,
740 unsigned long from_time)
741{
742 struct ceph_dentry_info *di = ceph_dentry(dentry);
743 long unsigned duration = le32_to_cpu(lease->duration_ms);
744 long unsigned ttl = from_time + (duration * HZ) / 1000;
745 long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
746 struct inode *dir;
747
748 /* only track leases on regular dentries */
749 if (dentry->d_op != &ceph_dentry_ops)
750 return;
751
752 spin_lock(&dentry->d_lock);
753 dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n",
754 dentry, le16_to_cpu(lease->mask), duration, ttl);
755
756 /* make lease_rdcache_gen match directory */
757 dir = dentry->d_parent->d_inode;
758 di->lease_shared_gen = ceph_inode(dir)->i_shared_gen;
759
760 if (lease->mask == 0)
761 goto out_unlock;
762
763 if (di->lease_gen == session->s_cap_gen &&
764 time_before(ttl, dentry->d_time))
765 goto out_unlock; /* we already have a newer lease. */
766
767 if (di->lease_session && di->lease_session != session)
768 goto out_unlock;
769
770 ceph_dentry_lru_touch(dentry);
771
772 if (!di->lease_session)
773 di->lease_session = ceph_get_mds_session(session);
774 di->lease_gen = session->s_cap_gen;
775 di->lease_seq = le32_to_cpu(lease->seq);
776 di->lease_renew_after = half_ttl;
777 di->lease_renew_from = 0;
778 dentry->d_time = ttl;
779out_unlock:
780 spin_unlock(&dentry->d_lock);
781 return;
782}
783
784/*
785 * splice a dentry to an inode.
786 * caller must hold directory i_mutex for this to be safe.
787 *
788 * we will only rehash the resulting dentry if @prehash is
789 * true; @prehash will be set to false (for the benefit of
790 * the caller) if we fail.
791 */
792static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
793 bool *prehash)
794{
795 struct dentry *realdn;
796
797 /* dn must be unhashed */
798 if (!d_unhashed(dn))
799 d_drop(dn);
800 realdn = d_materialise_unique(dn, in);
801 if (IS_ERR(realdn)) {
802 pr_err("splice_dentry error %p inode %p ino %llx.%llx\n",
803 dn, in, ceph_vinop(in));
804 if (prehash)
805 *prehash = false; /* don't rehash on error */
806 dn = realdn; /* note realdn contains the error */
807 goto out;
808 } else if (realdn) {
809 dout("dn %p (%d) spliced with %p (%d) "
810 "inode %p ino %llx.%llx\n",
811 dn, atomic_read(&dn->d_count),
812 realdn, atomic_read(&realdn->d_count),
813 realdn->d_inode, ceph_vinop(realdn->d_inode));
814 dput(dn);
815 dn = realdn;
816 } else {
817 BUG_ON(!ceph_dentry(dn));
818
819 dout("dn %p attached to %p ino %llx.%llx\n",
820 dn, dn->d_inode, ceph_vinop(dn->d_inode));
821 }
822 if ((!prehash || *prehash) && d_unhashed(dn))
823 d_rehash(dn);
824out:
825 return dn;
826}
827
828/*
829 * Set dentry's directory position based on the current dir's max, and
830 * order it in d_subdirs, so that dcache_readdir behaves.
831 */
832static void ceph_set_dentry_offset(struct dentry *dn)
833{
834 struct dentry *dir = dn->d_parent;
835 struct inode *inode = dn->d_parent->d_inode;
836 struct ceph_dentry_info *di;
837
838 BUG_ON(!inode);
839
840 di = ceph_dentry(dn);
841
842 spin_lock(&inode->i_lock);
843 di->offset = ceph_inode(inode)->i_max_offset++;
844 spin_unlock(&inode->i_lock);
845
846 spin_lock(&dcache_lock);
847 spin_lock(&dn->d_lock);
848 list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
849 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
850 dn->d_u.d_child.prev, dn->d_u.d_child.next);
851 spin_unlock(&dn->d_lock);
852 spin_unlock(&dcache_lock);
853}
854
855/*
856 * Incorporate results into the local cache. This is either just
857 * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
858 * after a lookup).
859 *
860 * A reply may contain:
861 * a directory inode along with a dentry,
862 * and/or a target inode.
863 *
864 * Called with snap_rwsem (read).
865 */
866int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
867 struct ceph_mds_session *session)
868{
869 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
870 struct inode *in = NULL;
871 struct ceph_mds_reply_inode *ininfo;
872 struct ceph_vino vino;
873 int i = 0;
874 int err = 0;
875
876 dout("fill_trace %p is_dentry %d is_target %d\n", req,
877 rinfo->head->is_dentry, rinfo->head->is_target);
878
879#if 0
880 /*
881 * Debugging hook:
882 *
883 * If we resend completed ops to a recovering mds, we get no
884 * trace. Since that is very rare, pretend this is the case
885 * to ensure the 'no trace' handlers in the callers behave.
886 *
887 * Fill in inodes unconditionally to avoid breaking cap
888 * invariants.
889 */
890 if (rinfo->head->op & CEPH_MDS_OP_WRITE) {
891 pr_info("fill_trace faking empty trace on %lld %s\n",
892 req->r_tid, ceph_mds_op_name(rinfo->head->op));
893 if (rinfo->head->is_dentry) {
894 rinfo->head->is_dentry = 0;
895 err = fill_inode(req->r_locked_dir,
896 &rinfo->diri, rinfo->dirfrag,
897 session, req->r_request_started, -1);
898 }
899 if (rinfo->head->is_target) {
900 rinfo->head->is_target = 0;
901 ininfo = rinfo->targeti.in;
902 vino.ino = le64_to_cpu(ininfo->ino);
903 vino.snap = le64_to_cpu(ininfo->snapid);
904 in = ceph_get_inode(sb, vino);
905 err = fill_inode(in, &rinfo->targeti, NULL,
906 session, req->r_request_started,
907 req->r_fmode);
908 iput(in);
909 }
910 }
911#endif
912
913 if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
914 dout("fill_trace reply is empty!\n");
915 if (rinfo->head->result == 0 && req->r_locked_dir) {
916 struct ceph_inode_info *ci =
917 ceph_inode(req->r_locked_dir);
918 dout(" clearing %p complete (empty trace)\n",
919 req->r_locked_dir);
920 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
921 ci->i_release_count++;
922 }
923 return 0;
924 }
925
926 if (rinfo->head->is_dentry) {
927 struct inode *dir = req->r_locked_dir;
928
929 err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
930 session, req->r_request_started, -1,
931 &req->r_caps_reservation);
932 if (err < 0)
933 return err;
934 }
935
936 if (rinfo->head->is_dentry && !req->r_aborted) {
937 /*
938 * lookup link rename : null -> possibly existing inode
939 * mknod symlink mkdir : null -> new inode
940 * unlink : linked -> null
941 */
942 struct inode *dir = req->r_locked_dir;
943 struct dentry *dn = req->r_dentry;
944 bool have_dir_cap, have_lease;
945
946 BUG_ON(!dn);
947 BUG_ON(!dir);
948 BUG_ON(dn->d_parent->d_inode != dir);
949 BUG_ON(ceph_ino(dir) !=
950 le64_to_cpu(rinfo->diri.in->ino));
951 BUG_ON(ceph_snap(dir) !=
952 le64_to_cpu(rinfo->diri.in->snapid));
953
954 /* do we have a lease on the whole dir? */
955 have_dir_cap =
956 (le32_to_cpu(rinfo->diri.in->cap.caps) &
957 CEPH_CAP_FILE_SHARED);
958
959 /* do we have a dn lease? */
960 have_lease = have_dir_cap ||
961 (le16_to_cpu(rinfo->dlease->mask) &
962 CEPH_LOCK_DN);
963
964 if (!have_lease)
965 dout("fill_trace no dentry lease or dir cap\n");
966
967 /* rename? */
968 if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
969 dout(" src %p '%.*s' dst %p '%.*s'\n",
970 req->r_old_dentry,
971 req->r_old_dentry->d_name.len,
972 req->r_old_dentry->d_name.name,
973 dn, dn->d_name.len, dn->d_name.name);
974 dout("fill_trace doing d_move %p -> %p\n",
975 req->r_old_dentry, dn);
976 d_move(req->r_old_dentry, dn);
977 dout(" src %p '%.*s' dst %p '%.*s'\n",
978 req->r_old_dentry,
979 req->r_old_dentry->d_name.len,
980 req->r_old_dentry->d_name.name,
981 dn, dn->d_name.len, dn->d_name.name);
982 /* ensure target dentry is invalidated, despite
983 rehashing bug in vfs_rename_dir */
984 dn->d_time = jiffies;
985 ceph_dentry(dn)->lease_shared_gen = 0;
986 /* take overwritten dentry's readdir offset */
987 ceph_dentry(req->r_old_dentry)->offset =
988 ceph_dentry(dn)->offset;
989 dn = req->r_old_dentry; /* use old_dentry */
990 in = dn->d_inode;
991 }
992
993 /* null dentry? */
994 if (!rinfo->head->is_target) {
995 dout("fill_trace null dentry\n");
996 if (dn->d_inode) {
997 dout("d_delete %p\n", dn);
998 d_delete(dn);
999 } else {
1000 dout("d_instantiate %p NULL\n", dn);
1001 d_instantiate(dn, NULL);
1002 if (have_lease && d_unhashed(dn))
1003 d_rehash(dn);
1004 update_dentry_lease(dn, rinfo->dlease,
1005 session,
1006 req->r_request_started);
1007 }
1008 goto done;
1009 }
1010
1011 /* attach proper inode */
1012 ininfo = rinfo->targeti.in;
1013 vino.ino = le64_to_cpu(ininfo->ino);
1014 vino.snap = le64_to_cpu(ininfo->snapid);
1015 if (!dn->d_inode) {
1016 in = ceph_get_inode(sb, vino);
1017 if (IS_ERR(in)) {
1018 pr_err("fill_trace bad get_inode "
1019 "%llx.%llx\n", vino.ino, vino.snap);
1020 err = PTR_ERR(in);
1021 d_delete(dn);
1022 goto done;
1023 }
1024 dn = splice_dentry(dn, in, &have_lease);
1025 if (IS_ERR(dn)) {
1026 err = PTR_ERR(dn);
1027 goto done;
1028 }
1029 req->r_dentry = dn; /* may have spliced */
1030 ceph_set_dentry_offset(dn);
1031 igrab(in);
1032 } else if (ceph_ino(in) == vino.ino &&
1033 ceph_snap(in) == vino.snap) {
1034 igrab(in);
1035 } else {
1036 dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
1037 dn, in, ceph_ino(in), ceph_snap(in),
1038 vino.ino, vino.snap);
1039 have_lease = false;
1040 in = NULL;
1041 }
1042
1043 if (have_lease)
1044 update_dentry_lease(dn, rinfo->dlease, session,
1045 req->r_request_started);
1046 dout(" final dn %p\n", dn);
1047 i++;
1048 } else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
1049 req->r_op == CEPH_MDS_OP_MKSNAP) {
1050 struct dentry *dn = req->r_dentry;
1051
1052 /* fill out a snapdir LOOKUPSNAP dentry */
1053 BUG_ON(!dn);
1054 BUG_ON(!req->r_locked_dir);
1055 BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR);
1056 ininfo = rinfo->targeti.in;
1057 vino.ino = le64_to_cpu(ininfo->ino);
1058 vino.snap = le64_to_cpu(ininfo->snapid);
1059 in = ceph_get_inode(sb, vino);
1060 if (IS_ERR(in)) {
1061 pr_err("fill_inode get_inode badness %llx.%llx\n",
1062 vino.ino, vino.snap);
1063 err = PTR_ERR(in);
1064 d_delete(dn);
1065 goto done;
1066 }
1067 dout(" linking snapped dir %p to dn %p\n", in, dn);
1068 dn = splice_dentry(dn, in, NULL);
1069 if (IS_ERR(dn)) {
1070 err = PTR_ERR(dn);
1071 goto done;
1072 }
1073 ceph_set_dentry_offset(dn);
1074 req->r_dentry = dn; /* may have spliced */
1075 igrab(in);
1076 rinfo->head->is_dentry = 1; /* fool notrace handlers */
1077 }
1078
1079 if (rinfo->head->is_target) {
1080 vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1081 vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1082
1083 if (in == NULL || ceph_ino(in) != vino.ino ||
1084 ceph_snap(in) != vino.snap) {
1085 in = ceph_get_inode(sb, vino);
1086 if (IS_ERR(in)) {
1087 err = PTR_ERR(in);
1088 goto done;
1089 }
1090 }
1091 req->r_target_inode = in;
1092
1093 err = fill_inode(in,
1094 &rinfo->targeti, NULL,
1095 session, req->r_request_started,
1096 (le32_to_cpu(rinfo->head->result) == 0) ?
1097 req->r_fmode : -1,
1098 &req->r_caps_reservation);
1099 if (err < 0) {
1100 pr_err("fill_inode badness %p %llx.%llx\n",
1101 in, ceph_vinop(in));
1102 goto done;
1103 }
1104 }
1105
1106done:
1107 dout("fill_trace done err=%d\n", err);
1108 return err;
1109}
1110
1111/*
1112 * Prepopulate our cache with readdir results, leases, etc.
1113 */
1114int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1115 struct ceph_mds_session *session)
1116{
1117 struct dentry *parent = req->r_dentry;
1118 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1119 struct qstr dname;
1120 struct dentry *dn;
1121 struct inode *in;
1122 int err = 0, i;
1123 struct inode *snapdir = NULL;
1124 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
1125 u64 frag = le32_to_cpu(rhead->args.readdir.frag);
1126 struct ceph_dentry_info *di;
1127
1128 if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
1129 snapdir = ceph_get_snapdir(parent->d_inode);
1130 parent = d_find_alias(snapdir);
1131 dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
1132 rinfo->dir_nr, parent);
1133 } else {
1134 dout("readdir_prepopulate %d items under dn %p\n",
1135 rinfo->dir_nr, parent);
1136 if (rinfo->dir_dir)
1137 ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);
1138 }
1139
1140 for (i = 0; i < rinfo->dir_nr; i++) {
1141 struct ceph_vino vino;
1142
1143 dname.name = rinfo->dir_dname[i];
1144 dname.len = rinfo->dir_dname_len[i];
1145 dname.hash = full_name_hash(dname.name, dname.len);
1146
1147 vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
1148 vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
1149
1150retry_lookup:
1151 dn = d_lookup(parent, &dname);
1152 dout("d_lookup on parent=%p name=%.*s got %p\n",
1153 parent, dname.len, dname.name, dn);
1154
1155 if (!dn) {
1156 dn = d_alloc(parent, &dname);
1157 dout("d_alloc %p '%.*s' = %p\n", parent,
1158 dname.len, dname.name, dn);
1159 if (dn == NULL) {
1160 dout("d_alloc badness\n");
1161 err = -ENOMEM;
1162 goto out;
1163 }
1164 err = ceph_init_dentry(dn);
1165 if (err < 0)
1166 goto out;
1167 } else if (dn->d_inode &&
1168 (ceph_ino(dn->d_inode) != vino.ino ||
1169 ceph_snap(dn->d_inode) != vino.snap)) {
1170 dout(" dn %p points to wrong inode %p\n",
1171 dn, dn->d_inode);
1172 d_delete(dn);
1173 dput(dn);
1174 goto retry_lookup;
1175 } else {
1176 /* reorder parent's d_subdirs */
1177 spin_lock(&dcache_lock);
1178 spin_lock(&dn->d_lock);
1179 list_move(&dn->d_u.d_child, &parent->d_subdirs);
1180 spin_unlock(&dn->d_lock);
1181 spin_unlock(&dcache_lock);
1182 }
1183
1184 di = dn->d_fsdata;
1185 di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
1186
1187 /* inode */
1188 if (dn->d_inode) {
1189 in = dn->d_inode;
1190 } else {
1191 in = ceph_get_inode(parent->d_sb, vino);
1192 if (in == NULL) {
1193 dout("new_inode badness\n");
1194 d_delete(dn);
1195 dput(dn);
1196 err = -ENOMEM;
1197 goto out;
1198 }
1199 dn = splice_dentry(dn, in, NULL);
1200 }
1201
1202 if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
1203 req->r_request_started, -1,
1204 &req->r_caps_reservation) < 0) {
1205 pr_err("fill_inode badness on %p\n", in);
1206 dput(dn);
1207 continue;
1208 }
1209 update_dentry_lease(dn, rinfo->dir_dlease[i],
1210 req->r_session, req->r_request_started);
1211 dput(dn);
1212 }
1213 req->r_did_prepopulate = true;
1214
1215out:
1216 if (snapdir) {
1217 iput(snapdir);
1218 dput(parent);
1219 }
1220 dout("readdir_prepopulate done\n");
1221 return err;
1222}
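
Each prepopulated dentry gets its readdir position from ceph_make_fpos(frag, ...). That helper (defined elsewhere in the tree) presumably packs the directory fragment into the high 32 bits of the loff_t and the per-fragment entry offset into the low 32, so positions from different fragments never collide; modeled standalone:

#include <stdio.h>
#include <stdint.h>

static int64_t make_fpos(uint32_t frag, uint32_t off)
{
	return ((int64_t)frag << 32) | (int64_t)off;
}

int main(void)
{
	int64_t pos = make_fpos(0x01800000, 5);

	printf("frag=%x off=%u\n",
	       (uint32_t)(pos >> 32), (uint32_t)pos); /* frag=1800000 off=5 */
	return 0;
}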
1223
1224int ceph_inode_set_size(struct inode *inode, loff_t size)
1225{
1226 struct ceph_inode_info *ci = ceph_inode(inode);
1227 int ret = 0;
1228
1229 spin_lock(&inode->i_lock);
1230 dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
1231 inode->i_size = size;
1232 inode->i_blocks = (size + (1 << 9) - 1) >> 9;
1233
1234 /* tell the MDS if we are approaching max_size */
1235 if ((size << 1) >= ci->i_max_size &&
1236 (ci->i_reported_size << 1) < ci->i_max_size)
1237 ret = 1;
1238
1239 spin_unlock(&inode->i_lock);
1240 return ret;
1241}
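
The condition above reports back to the MDS exactly once per max_size window: as soon as the new size crosses half of max_size, and only if the size we last reported had not. The same arithmetic, standalone:

#include <stdio.h>
#include <stdint.h>

static int should_report(uint64_t size, uint64_t reported, uint64_t max)
{
	return (size << 1) >= max && (reported << 1) < max;
}

int main(void)
{
	/* max_size 100: report on crossing 50, but only the first time */
	printf("%d\n", should_report(50, 10, 100));	/* 1 */
	printf("%d\n", should_report(60, 55, 100));	/* 0: already told */
	return 0;
}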
1242
1243/*
1244 * Write back inode data in a worker thread. (This can't be done
1245 * in the message handler context.)
1246 */
1247void ceph_queue_writeback(struct inode *inode)
1248{
1249 if (queue_work(ceph_inode_to_client(inode)->wb_wq,
1250 &ceph_inode(inode)->i_wb_work)) {
1251 dout("ceph_queue_writeback %p\n", inode);
1252 igrab(inode);
1253 } else {
1254 dout("ceph_queue_writeback %p failed\n", inode);
1255 }
1256}
1257
1258static void ceph_writeback_work(struct work_struct *work)
1259{
1260 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1261 i_wb_work);
1262 struct inode *inode = &ci->vfs_inode;
1263
1264 dout("writeback %p\n", inode);
1265 filemap_fdatawrite(&inode->i_data);
1266 iput(inode);
1267}
1268
1269/*
1270 * queue an async invalidation
1271 */
1272void ceph_queue_invalidate(struct inode *inode)
1273{
1274 if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
1275 &ceph_inode(inode)->i_pg_inv_work)) {
1276 dout("ceph_queue_invalidate %p\n", inode);
1277 igrab(inode);
1278 } else {
1279 dout("ceph_queue_invalidate %p failed\n", inode);
1280 }
1281}
1282
1283/*
1284 * invalidate any pages that are not dirty or under writeback. this
1285 * includes pages that are clean and mapped.
1286 */
1287static void ceph_invalidate_nondirty_pages(struct address_space *mapping)
1288{
1289 struct pagevec pvec;
1290 pgoff_t next = 0;
1291 int i;
1292
1293 pagevec_init(&pvec, 0);
1294 while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
1295 for (i = 0; i < pagevec_count(&pvec); i++) {
1296 struct page *page = pvec.pages[i];
1297 pgoff_t index;
1298 int skip_page =
1299 (PageDirty(page) || PageWriteback(page));
1300
1301 if (!skip_page)
1302 skip_page = !trylock_page(page);
1303
1304 /*
1305 * We really shouldn't be looking at the ->index of an
1306 * unlocked page. But we're not allowed to lock these
1307 * pages. So we rely upon nobody altering the ->index
1308 * of this (pinned-by-us) page.
1309 */
1310 index = page->index;
1311 if (index > next)
1312 next = index;
1313 next++;
1314
1315 if (skip_page)
1316 continue;
1317
1318 generic_error_remove_page(mapping, page);
1319 unlock_page(page);
1320 }
1321 pagevec_release(&pvec);
1322 cond_resched();
1323 }
1324}
1325
1326/*
1327 * Invalidate inode pages in a worker thread. (This can't be done
1328 * in the message handler context.)
1329 */
1330static void ceph_invalidate_work(struct work_struct *work)
1331{
1332 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1333 i_pg_inv_work);
1334 struct inode *inode = &ci->vfs_inode;
1335 u32 orig_gen;
1336 int check = 0;
1337
1338 spin_lock(&inode->i_lock);
1339 dout("invalidate_pages %p gen %d revoking %d\n", inode,
1340 ci->i_rdcache_gen, ci->i_rdcache_revoking);
1341 if (ci->i_rdcache_gen == 0 ||
1342 ci->i_rdcache_revoking != ci->i_rdcache_gen) {
1343 BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
1344 /* nevermind! */
1345 ci->i_rdcache_revoking = 0;
1346 spin_unlock(&inode->i_lock);
1347 goto out;
1348 }
1349 orig_gen = ci->i_rdcache_gen;
1350 spin_unlock(&inode->i_lock);
1351
1352 ceph_invalidate_nondirty_pages(inode->i_mapping);
1353
1354 spin_lock(&inode->i_lock);
1355 if (orig_gen == ci->i_rdcache_gen) {
1356 dout("invalidate_pages %p gen %d successful\n", inode,
1357 ci->i_rdcache_gen);
1358 ci->i_rdcache_gen = 0;
1359 ci->i_rdcache_revoking = 0;
1360 check = 1;
1361 } else {
1362 dout("invalidate_pages %p gen %d raced, gen now %d\n",
1363 inode, orig_gen, ci->i_rdcache_gen);
1364 }
1365 spin_unlock(&inode->i_lock);
1366
1367 if (check)
1368 ceph_check_caps(ci, 0, NULL);
1369out:
1370 iput(inode);
1371}
1372
1373
1374/*
1375 * called by trunc_wq; take i_mutex ourselves
1376 *
1377 * We also truncate in a separate thread.
1378 */
1379static void ceph_vmtruncate_work(struct work_struct *work)
1380{
1381 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1382 i_vmtruncate_work);
1383 struct inode *inode = &ci->vfs_inode;
1384
1385 dout("vmtruncate_work %p\n", inode);
1386 mutex_lock(&inode->i_mutex);
1387 __ceph_do_pending_vmtruncate(inode);
1388 mutex_unlock(&inode->i_mutex);
1389 iput(inode);
1390}
1391
1392/*
1393 * Queue an async vmtruncate. If we fail to queue work, we will handle
1394 * the truncation the next time we call __ceph_do_pending_vmtruncate.
1395 */
1396void ceph_queue_vmtruncate(struct inode *inode)
1397{
1398 struct ceph_inode_info *ci = ceph_inode(inode);
1399
1400 if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
1401 &ci->i_vmtruncate_work)) {
1402 dout("ceph_queue_vmtruncate %p\n", inode);
1403 igrab(inode);
1404 } else {
1405 dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
1406 inode, ci->i_truncate_pending);
1407 }
1408}
1409
1410/*
1411 * called with i_mutex held.
1412 *
1413 * Make sure any pending truncation is applied before doing anything
1414 * that may depend on it.
1415 */
1416void __ceph_do_pending_vmtruncate(struct inode *inode)
1417{
1418 struct ceph_inode_info *ci = ceph_inode(inode);
1419 u64 to;
1420 int wrbuffer_refs, wake = 0;
1421
1422retry:
1423 spin_lock(&inode->i_lock);
1424 if (ci->i_truncate_pending == 0) {
1425 dout("__do_pending_vmtruncate %p none pending\n", inode);
1426 spin_unlock(&inode->i_lock);
1427 return;
1428 }
1429
1430 /*
1431 * make sure any dirty snapped pages are flushed before we
1432 * possibly truncate them... so write AND block!
1433 */
1434 if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
1435 dout("__do_pending_vmtruncate %p flushing snaps first\n",
1436 inode);
1437 spin_unlock(&inode->i_lock);
1438 filemap_write_and_wait_range(&inode->i_data, 0,
1439 inode->i_sb->s_maxbytes);
1440 goto retry;
1441 }
1442
1443 to = ci->i_truncate_size;
1444 wrbuffer_refs = ci->i_wrbuffer_ref;
1445 dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
1446 ci->i_truncate_pending, to);
1447 spin_unlock(&inode->i_lock);
1448
1449 truncate_inode_pages(inode->i_mapping, to);
1450
1451 spin_lock(&inode->i_lock);
1452 ci->i_truncate_pending--;
1453 if (ci->i_truncate_pending == 0)
1454 wake = 1;
1455 spin_unlock(&inode->i_lock);
1456
1457 if (wrbuffer_refs == 0)
1458 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
1459 if (wake)
1460 wake_up(&ci->i_cap_wq);
1461}
1462
1463
1464/*
1465 * symlinks
1466 */
1467static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
1468{
1469 struct ceph_inode_info *ci = ceph_inode(dentry->d_inode);
1470 nd_set_link(nd, ci->i_symlink);
1471 return NULL;
1472}
1473
1474static const struct inode_operations ceph_symlink_iops = {
1475 .readlink = generic_readlink,
1476 .follow_link = ceph_sym_follow_link,
1477};
1478
1479/*
1480 * setattr
1481 */
1482int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1483{
1484 struct inode *inode = dentry->d_inode;
1485 struct ceph_inode_info *ci = ceph_inode(inode);
1486 struct inode *parent_inode = dentry->d_parent->d_inode;
1487 const unsigned int ia_valid = attr->ia_valid;
1488 struct ceph_mds_request *req;
1489 struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc;
1490 int issued;
1491 int release = 0, dirtied = 0;
1492 int mask = 0;
1493 int err = 0;
1494
1495 if (ceph_snap(inode) != CEPH_NOSNAP)
1496 return -EROFS;
1497
1498 __ceph_do_pending_vmtruncate(inode);
1499
1500 err = inode_change_ok(inode, attr);
1501 if (err != 0)
1502 return err;
1503
1504 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
1505 USE_AUTH_MDS);
1506 if (IS_ERR(req))
1507 return PTR_ERR(req);
1508
1509 spin_lock(&inode->i_lock);
1510 issued = __ceph_caps_issued(ci, NULL);
1511 dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
1512
1513 if (ia_valid & ATTR_UID) {
1514 dout("setattr %p uid %d -> %d\n", inode,
1515 inode->i_uid, attr->ia_uid);
1516 if (issued & CEPH_CAP_AUTH_EXCL) {
1517 inode->i_uid = attr->ia_uid;
1518 dirtied |= CEPH_CAP_AUTH_EXCL;
1519 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1520 attr->ia_uid != inode->i_uid) {
1521 req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid);
1522 mask |= CEPH_SETATTR_UID;
1523 release |= CEPH_CAP_AUTH_SHARED;
1524 }
1525 }
1526 if (ia_valid & ATTR_GID) {
1527 dout("setattr %p gid %d -> %d\n", inode,
1528 inode->i_gid, attr->ia_gid);
1529 if (issued & CEPH_CAP_AUTH_EXCL) {
1530 inode->i_gid = attr->ia_gid;
1531 dirtied |= CEPH_CAP_AUTH_EXCL;
1532 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1533 attr->ia_gid != inode->i_gid) {
1534 req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid);
1535 mask |= CEPH_SETATTR_GID;
1536 release |= CEPH_CAP_AUTH_SHARED;
1537 }
1538 }
1539 if (ia_valid & ATTR_MODE) {
1540 dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
1541 attr->ia_mode);
1542 if (issued & CEPH_CAP_AUTH_EXCL) {
1543 inode->i_mode = attr->ia_mode;
1544 dirtied |= CEPH_CAP_AUTH_EXCL;
1545 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1546 attr->ia_mode != inode->i_mode) {
1547 req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
1548 mask |= CEPH_SETATTR_MODE;
1549 release |= CEPH_CAP_AUTH_SHARED;
1550 }
1551 }
1552
1553 if (ia_valid & ATTR_ATIME) {
1554 dout("setattr %p atime %ld.%ld -> %ld.%ld\n", inode,
1555 inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
1556 attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
1557 if (issued & CEPH_CAP_FILE_EXCL) {
1558 ci->i_time_warp_seq++;
1559 inode->i_atime = attr->ia_atime;
1560 dirtied |= CEPH_CAP_FILE_EXCL;
1561 } else if ((issued & CEPH_CAP_FILE_WR) &&
1562 timespec_compare(&inode->i_atime,
1563 &attr->ia_atime) < 0) {
1564 inode->i_atime = attr->ia_atime;
1565 dirtied |= CEPH_CAP_FILE_WR;
1566 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1567 !timespec_equal(&inode->i_atime, &attr->ia_atime)) {
1568 ceph_encode_timespec(&req->r_args.setattr.atime,
1569 &attr->ia_atime);
1570 mask |= CEPH_SETATTR_ATIME;
1571 release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
1572 CEPH_CAP_FILE_WR;
1573 }
1574 }
1575 if (ia_valid & ATTR_MTIME) {
1576 dout("setattr %p mtime %ld.%ld -> %ld.%ld\n", inode,
1577 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
1578 attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
1579 if (issued & CEPH_CAP_FILE_EXCL) {
1580 ci->i_time_warp_seq++;
1581 inode->i_mtime = attr->ia_mtime;
1582 dirtied |= CEPH_CAP_FILE_EXCL;
1583 } else if ((issued & CEPH_CAP_FILE_WR) &&
1584 timespec_compare(&inode->i_mtime,
1585 &attr->ia_mtime) < 0) {
1586 inode->i_mtime = attr->ia_mtime;
1587 dirtied |= CEPH_CAP_FILE_WR;
1588 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1589 !timespec_equal(&inode->i_mtime, &attr->ia_mtime)) {
1590 ceph_encode_timespec(&req->r_args.setattr.mtime,
1591 &attr->ia_mtime);
1592 mask |= CEPH_SETATTR_MTIME;
1593 release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
1594 CEPH_CAP_FILE_WR;
1595 }
1596 }
1597 if (ia_valid & ATTR_SIZE) {
1598 dout("setattr %p size %lld -> %lld\n", inode,
1599 inode->i_size, attr->ia_size);
1600 if (attr->ia_size > inode->i_sb->s_maxbytes) {
1601 err = -EINVAL;
1602 goto out;
1603 }
1604 if ((issued & CEPH_CAP_FILE_EXCL) &&
1605 attr->ia_size > inode->i_size) {
1606 inode->i_size = attr->ia_size;
1607 inode->i_blocks =
1608 (attr->ia_size + (1 << 9) - 1) >> 9;
1609 inode->i_ctime = attr->ia_ctime;
1610 ci->i_reported_size = attr->ia_size;
1611 dirtied |= CEPH_CAP_FILE_EXCL;
1612 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1613 attr->ia_size != inode->i_size) {
1614 req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
1615 req->r_args.setattr.old_size =
1616 cpu_to_le64(inode->i_size);
1617 mask |= CEPH_SETATTR_SIZE;
1618 release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
1619 CEPH_CAP_FILE_WR;
1620 }
1621 }
1622
1623 /* these do nothing */
1624 if (ia_valid & ATTR_CTIME) {
1625 bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
1626 ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
1627 dout("setattr %p ctime %ld.%ld -> %ld.%ld (%s)\n", inode,
1628 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
1629 attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
1630 only ? "ctime only" : "ignored");
1631 inode->i_ctime = attr->ia_ctime;
1632 if (only) {
1633 /*
1634 * if kernel wants to dirty ctime but nothing else,
1635 * we need to choose a cap to dirty under, or do
1636 * an almost-no-op setattr
1637 */
1638 if (issued & CEPH_CAP_AUTH_EXCL)
1639 dirtied |= CEPH_CAP_AUTH_EXCL;
1640 else if (issued & CEPH_CAP_FILE_EXCL)
1641 dirtied |= CEPH_CAP_FILE_EXCL;
1642 else if (issued & CEPH_CAP_XATTR_EXCL)
1643 dirtied |= CEPH_CAP_XATTR_EXCL;
1644 else
1645 mask |= CEPH_SETATTR_CTIME;
1646 }
1647 }
1648 if (ia_valid & ATTR_FILE)
1649 dout("setattr %p ATTR_FILE ... hrm!\n", inode);
1650
1651 if (dirtied) {
1652 __ceph_mark_dirty_caps(ci, dirtied);
1653 inode->i_ctime = CURRENT_TIME;
1654 }
1655
1656 release &= issued;
1657 spin_unlock(&inode->i_lock);
1658
1659 if (mask) {
1660 req->r_inode = igrab(inode);
1661 req->r_inode_drop = release;
1662 req->r_args.setattr.mask = cpu_to_le32(mask);
1663 req->r_num_caps = 1;
1664 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
1665 }
1666 dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
1667 ceph_cap_string(dirtied), mask);
1668
1669 ceph_mdsc_put_request(req);
1670 __ceph_do_pending_vmtruncate(inode);
1671 return err;
1672out:
1673 spin_unlock(&inode->i_lock);
1674 ceph_mdsc_put_request(req);
1675 return err;
1676}
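
Every attribute in ceph_setattr() goes through the same three-way decision: with an exclusive cap the change is applied locally and the cap marked dirty; otherwise, if the value actually differs (or the shared cap is missing), the change is delegated to the MDS; else nothing needs doing. Distilled into a standalone sketch (a simplification of the branches above):

#include <stdio.h>

enum setattr_route { LOCAL_DIRTY, SEND_TO_MDS, NOOP };

static enum setattr_route route(int have_excl, int have_shared,
				int value_differs)
{
	if (have_excl)
		return LOCAL_DIRTY;	/* update inode, dirty the cap */
	if (!have_shared || value_differs)
		return SEND_TO_MDS;	/* fill r_args, set mask bits */
	return NOOP;
}

int main(void)
{
	printf("%d\n", route(1, 0, 1));	/* 0: LOCAL_DIRTY */
	printf("%d\n", route(0, 1, 1));	/* 1: SEND_TO_MDS */
	printf("%d\n", route(0, 1, 0));	/* 2: NOOP */
	return 0;
}

The timestamp fields (atime/mtime) add one more case on top of this: a held FILE_WR cap lets a strictly newer timestamp be applied locally as well.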
1677
1678/*
1679 * Verify that we have a lease on the given mask. If not,
1680 * do a getattr against an mds.
1681 */
1682int ceph_do_getattr(struct inode *inode, int mask)
1683{
1684 struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
1685 struct ceph_mds_client *mdsc = &client->mdsc;
1686 struct ceph_mds_request *req;
1687 int err;
1688
1689 if (ceph_snap(inode) == CEPH_SNAPDIR) {
1690 dout("do_getattr inode %p SNAPDIR\n", inode);
1691 return 0;
1692 }
1693
1694 dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask));
1695 if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
1696 return 0;
1697
1698 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
1699 if (IS_ERR(req))
1700 return PTR_ERR(req);
1701 req->r_inode = igrab(inode);
1702 req->r_num_caps = 1;
1703 req->r_args.getattr.mask = cpu_to_le32(mask);
1704 err = ceph_mdsc_do_request(mdsc, NULL, req);
1705 ceph_mdsc_put_request(req);
1706 dout("do_getattr result=%d\n", err);
1707 return err;
1708}
1709
1710
1711/*
1712 * Check inode permissions. We verify we have a valid value for
1713 * the AUTH cap, then call the generic handler.
1714 */
1715int ceph_permission(struct inode *inode, int mask)
1716{
1717 int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
1718
1719 if (!err)
1720 err = generic_permission(inode, mask, NULL);
1721 return err;
1722}
1723
1724/*
1725 * Get all attributes. Hopefully someday we'll have a statlite()
1726 * and can limit the fields we require to be accurate.
1727 */
1728int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
1729 struct kstat *stat)
1730{
1731 struct inode *inode = dentry->d_inode;
1732 struct ceph_inode_info *ci = ceph_inode(inode);
1733 int err;
1734
1735 err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
1736 if (!err) {
1737 generic_fillattr(inode, stat);
1738 stat->ino = inode->i_ino;
1739 if (ceph_snap(inode) != CEPH_NOSNAP)
1740 stat->dev = ceph_snap(inode);
1741 else
1742 stat->dev = 0;
1743 if (S_ISDIR(inode->i_mode)) {
1744 stat->size = ci->i_rbytes;
1745 stat->blocks = 0;
1746 stat->blksize = 65536;
1747 }
1748 }
1749 return err;
1750}
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
new file mode 100644
index 000000000000..8a5bcae62846
--- /dev/null
+++ b/fs/ceph/ioctl.c
@@ -0,0 +1,160 @@
1#include <linux/in.h>
2
3#include "ioctl.h"
4#include "super.h"
5#include "ceph_debug.h"
6
7
8/*
9 * ioctls
10 */
11
12/*
13 * get and set the file layout
14 */
15static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
16{
17 struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
18 struct ceph_ioctl_layout l;
19 int err;
20
21 err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
22 if (!err) {
23 l.stripe_unit = ceph_file_layout_su(ci->i_layout);
24 l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
25 l.object_size = ceph_file_layout_object_size(ci->i_layout);
26 l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
27 l.preferred_osd =
28 (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred);
29 if (copy_to_user(arg, &l, sizeof(l)))
30 return -EFAULT;
31 }
32
33 return err;
34}
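
/*
 * A minimal userspace sketch of driving the get-layout ioctl above; the
 * path is a hypothetical file on a ceph mount, and error handling is
 * trimmed to the essentials.
 */
#include <stdio.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include "ioctl.h"	/* CEPH_IOC_GET_LAYOUT, struct ceph_ioctl_layout */

int main(void)
{
	struct ceph_ioctl_layout l;
	int fd = open("/mnt/ceph/somefile", O_RDONLY);

	if (fd < 0 || ioctl(fd, CEPH_IOC_GET_LAYOUT, &l) < 0)
		return 1;
	printf("stripe_unit=%llu stripe_count=%llu object_size=%llu\n",
	       (unsigned long long)l.stripe_unit,
	       (unsigned long long)l.stripe_count,
	       (unsigned long long)l.object_size);
	return 0;
}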
35
36static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
37{
38 struct inode *inode = file->f_dentry->d_inode;
39 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
40 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
41 struct ceph_mds_request *req;
42 struct ceph_ioctl_layout l;
43 int err, i;
44
45 /* copy and validate */
46 if (copy_from_user(&l, arg, sizeof(l)))
47 return -EFAULT;
48
49 if ((l.object_size & ~PAGE_MASK) ||
50 (l.stripe_unit & ~PAGE_MASK) ||
51 !l.stripe_unit ||
52 (l.object_size &&
53 (unsigned)l.object_size % (unsigned)l.stripe_unit))
54 return -EINVAL;
55
56 /* make sure it's a valid data pool */
57 if (l.data_pool > 0) {
58 mutex_lock(&mdsc->mutex);
59 err = -EINVAL;
60 for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
61 if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
62 err = 0;
63 break;
64 }
65 mutex_unlock(&mdsc->mutex);
66 if (err)
67 return err;
68 }
69
70 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT,
71 USE_AUTH_MDS);
72 if (IS_ERR(req))
73 return PTR_ERR(req);
74 req->r_inode = igrab(inode);
75 req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
76
77 req->r_args.setlayout.layout.fl_stripe_unit =
78 cpu_to_le32(l.stripe_unit);
79 req->r_args.setlayout.layout.fl_stripe_count =
80 cpu_to_le32(l.stripe_count);
81 req->r_args.setlayout.layout.fl_object_size =
82 cpu_to_le32(l.object_size);
83 req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
84 req->r_args.setlayout.layout.fl_pg_preferred =
85 cpu_to_le32(l.preferred_osd);
86
87 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
88 ceph_mdsc_put_request(req);
89 return err;
90}
91
92/*
93 * Return object name, size/offset information, and location (OSD
94 * number, network address) for a given file offset.
95 */
96static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
97{
98 struct ceph_ioctl_dataloc dl;
99 struct inode *inode = file->f_dentry->d_inode;
100 struct ceph_inode_info *ci = ceph_inode(inode);
101 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
102 u64 len = 1, olen;
103 u64 tmp;
104 struct ceph_object_layout ol;
105 struct ceph_pg pgid;
106
107 /* copy and validate */
108 if (copy_from_user(&dl, arg, sizeof(dl)))
109 return -EFAULT;
110
111 down_read(&osdc->map_sem);
112 ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
113 &dl.object_no, &dl.object_offset, &olen);
114 dl.file_offset -= dl.object_offset;
115 dl.object_size = ceph_file_layout_object_size(ci->i_layout);
116 dl.block_size = ceph_file_layout_su(ci->i_layout);
117
118 /* block_offset = object_offset % block_size */
119 tmp = dl.object_offset;
120 dl.block_offset = do_div(tmp, dl.block_size);
121
122 snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
123 ceph_ino(inode), dl.object_no);
124 ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
125 osdc->osdmap);
126
127 pgid = ol.ol_pgid;
128 dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
129 if (dl.osd >= 0) {
130 struct ceph_entity_addr *a =
131 ceph_osd_addr(osdc->osdmap, dl.osd);
132 if (a)
133 memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr));
134 } else {
135 memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
136 }
137 up_read(&osdc->map_sem);
138
139 /* send result back to user */
140 if (copy_to_user(arg, &dl, sizeof(dl)))
141 return -EFAULT;
142
143 return 0;
144}
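
/*
 * do_div() (asm/div64.h) is the kernel's 64-by-32 division helper: it
 * replaces its first argument with the quotient and returns the
 * remainder, which is why object_offset is copied into tmp above before
 * the call. A standalone sketch of the same computation:
 */
static inline u32 block_offset_of(u64 object_offset, u32 block_size)
{
	/* do_div modifies its argument, so operate on the local copy */
	return do_div(object_offset, block_size);
}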
145
146long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
147{
148 dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
149 switch (cmd) {
150 case CEPH_IOC_GET_LAYOUT:
151 return ceph_ioctl_get_layout(file, (void __user *)arg);
152
153 case CEPH_IOC_SET_LAYOUT:
154 return ceph_ioctl_set_layout(file, (void __user *)arg);
155
156 case CEPH_IOC_GET_DATALOC:
157 return ceph_ioctl_get_dataloc(file, (void __user *)arg);
158 }
159 return -ENOTTY;
160}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
new file mode 100644
index 000000000000..25e4f1a9d059
--- /dev/null
+++ b/fs/ceph/ioctl.h
@@ -0,0 +1,40 @@
1#ifndef FS_CEPH_IOCTL_H
2#define FS_CEPH_IOCTL_H
3
4#include <linux/ioctl.h>
5#include <linux/types.h>
6
7#define CEPH_IOCTL_MAGIC 0x97
8
9/* just use u64 to align sanely on all archs */
10struct ceph_ioctl_layout {
11 __u64 stripe_unit, stripe_count, object_size;
12 __u64 data_pool;
13 __s64 preferred_osd;
14};
15
16#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1, \
17 struct ceph_ioctl_layout)
18#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \
19 struct ceph_ioctl_layout)
20
21/*
22 * Extract identity, address of the OSD and object storing a given
23 * file offset.
24 */
25struct ceph_ioctl_dataloc {
26 __u64 file_offset; /* in+out: file offset */
27 __u64 object_offset; /* out: offset in object */
28 __u64 object_no; /* out: object # */
29 __u64 object_size; /* out: object size */
30 char object_name[64]; /* out: object name */
31 __u64 block_offset; /* out: offset in block */
32 __u64 block_size; /* out: block length */
33 __s64 osd; /* out: osd # */
34 struct sockaddr_storage osd_addr; /* out: osd address */
35};
36
37#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
38 struct ceph_ioctl_dataloc)
39
40#endif
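
/*
 * A minimal userspace sketch of CEPH_IOC_GET_DATALOC: ask which object
 * (and which OSD) backs a given file offset. The helper below is
 * hypothetical; fd is assumed to be an open file on a ceph mount.
 */
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include "ioctl.h"

static int print_dataloc(int fd, __u64 offset)
{
	struct ceph_ioctl_dataloc dl = { .file_offset = offset };

	if (ioctl(fd, CEPH_IOC_GET_DATALOC, &dl) < 0)
		return -1;
	printf("offset %llu -> object %s on osd%lld\n",
	       (unsigned long long)dl.file_offset,
	       dl.object_name, (long long)dl.osd);
	return 0;
}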
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
new file mode 100644
index 000000000000..a2600101ec22
--- /dev/null
+++ b/fs/ceph/mds_client.c
@@ -0,0 +1,3021 @@
1#include "ceph_debug.h"
2
3#include <linux/wait.h>
4#include <linux/sched.h>
5
6#include "mds_client.h"
7#include "mon_client.h"
8#include "super.h"
9#include "messenger.h"
10#include "decode.h"
11#include "auth.h"
12#include "pagelist.h"
13
14/*
15 * A cluster of MDS (metadata server) daemons is responsible for
16 * managing the file system namespace (the directory hierarchy and
17 * inodes) and for coordinating shared access to storage. Metadata is
18 * partitioned hierarchically across a number of servers, and that
19 * partition varies over time as the cluster adjusts the distribution
20 * in order to balance load.
21 *
22 * The MDS client is primarily responsible for managing synchronous
23 * metadata requests for operations like open, unlink, and so forth.
24 * If there is an MDS failure, we find out about it when we (possibly
25 * request and) receive a new MDS map, and can resubmit affected
26 * requests.
27 *
28 * For the most part, though, we take advantage of a lossless
29 * communications channel to the MDS, and do not need to worry about
30 * timing out or resubmitting requests.
31 *
32 * We maintain a stateful "session" with each MDS we interact with.
33 * Within each session, we send periodic heartbeat messages to ensure
34 * any capabilities or leases we have been issued remain valid. If
35 * the session times out and goes stale, our leases and capabilities
36 * are no longer valid.
37 */
38
39static void __wake_requests(struct ceph_mds_client *mdsc,
40 struct list_head *head);
41
42static const struct ceph_connection_operations mds_con_ops;
43
44
45/*
46 * mds reply parsing
47 */
48
49/*
50 * parse individual inode info
51 */
52static int parse_reply_info_in(void **p, void *end,
53 struct ceph_mds_reply_info_in *info)
54{
55 int err = -EIO;
56
57 info->in = *p;
58 *p += sizeof(struct ceph_mds_reply_inode) +
59 sizeof(*info->in->fragtree.splits) *
60 le32_to_cpu(info->in->fragtree.nsplits);
61
62 ceph_decode_32_safe(p, end, info->symlink_len, bad);
63 ceph_decode_need(p, end, info->symlink_len, bad);
64 info->symlink = *p;
65 *p += info->symlink_len;
66
67 ceph_decode_32_safe(p, end, info->xattr_len, bad);
68 ceph_decode_need(p, end, info->xattr_len, bad);
69 info->xattr_data = *p;
70 *p += info->xattr_len;
71 return 0;
72bad:
73 return err;
74}
75
76/*
77 * parse a normal reply, which may contain a (dir+)dentry and/or a
78 * target inode.
79 */
80static int parse_reply_info_trace(void **p, void *end,
81 struct ceph_mds_reply_info_parsed *info)
82{
83 int err;
84
85 if (info->head->is_dentry) {
86 err = parse_reply_info_in(p, end, &info->diri);
87 if (err < 0)
88 goto out_bad;
89
90 if (unlikely(*p + sizeof(*info->dirfrag) > end))
91 goto bad;
92 info->dirfrag = *p;
93 *p += sizeof(*info->dirfrag) +
94 sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
95 if (unlikely(*p > end))
96 goto bad;
97
98 ceph_decode_32_safe(p, end, info->dname_len, bad);
99 ceph_decode_need(p, end, info->dname_len, bad);
100 info->dname = *p;
101 *p += info->dname_len;
102 info->dlease = *p;
103 *p += sizeof(*info->dlease);
104 }
105
106 if (info->head->is_target) {
107 err = parse_reply_info_in(p, end, &info->targeti);
108 if (err < 0)
109 goto out_bad;
110 }
111
112 if (unlikely(*p != end))
113 goto bad;
114 return 0;
115
116bad:
117 err = -EIO;
118out_bad:
119 pr_err("problem parsing mds trace %d\n", err);
120 return err;
121}
122
123/*
124 * parse readdir results
125 */
126static int parse_reply_info_dir(void **p, void *end,
127 struct ceph_mds_reply_info_parsed *info)
128{
129 u32 num, i = 0;
130 int err;
131
132 info->dir_dir = *p;
133 if (*p + sizeof(*info->dir_dir) > end)
134 goto bad;
135 *p += sizeof(*info->dir_dir) +
136 sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
137 if (*p > end)
138 goto bad;
139
140 ceph_decode_need(p, end, sizeof(num) + 2, bad);
141 num = ceph_decode_32(p);
142 info->dir_end = ceph_decode_8(p);
143 info->dir_complete = ceph_decode_8(p);
144 if (num == 0)
145 goto done;
146
147 /* alloc large array */
148 info->dir_nr = num;
149 info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
150 sizeof(*info->dir_dname) +
151 sizeof(*info->dir_dname_len) +
152 sizeof(*info->dir_dlease),
153 GFP_NOFS);
154 if (info->dir_in == NULL) {
155 err = -ENOMEM;
156 goto out_bad;
157 }
158 info->dir_dname = (void *)(info->dir_in + num);
159 info->dir_dname_len = (void *)(info->dir_dname + num);
160 info->dir_dlease = (void *)(info->dir_dname_len + num);
161
162 while (num) {
163 /* dentry */
164 ceph_decode_need(p, end, sizeof(u32)*2, bad);
165 info->dir_dname_len[i] = ceph_decode_32(p);
166 ceph_decode_need(p, end, info->dir_dname_len[i], bad);
167 info->dir_dname[i] = *p;
168 *p += info->dir_dname_len[i];
169 dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
170 info->dir_dname[i]);
171 info->dir_dlease[i] = *p;
172 *p += sizeof(struct ceph_mds_reply_lease);
173
174 /* inode */
175 err = parse_reply_info_in(p, end, &info->dir_in[i]);
176 if (err < 0)
177 goto out_bad;
178 i++;
179 num--;
180 }
181
182done:
183 if (*p != end)
184 goto bad;
185 return 0;
186
187bad:
188 err = -EIO;
189out_bad:
190 pr_err("problem parsing dir contents %d\n", err);
191 return err;
192}
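
/*
 * Note the allocation above: a single kcalloc() is sized to hold all
 * four per-entry arrays (inode info, dname pointers, dname lengths,
 * dentry leases) and then carved up with pointer arithmetic, so the
 * lone kfree(info->dir_in) in destroy_reply_info() releases the lot.
 */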
193
194/*
195 * parse entire mds reply
196 */
197static int parse_reply_info(struct ceph_msg *msg,
198 struct ceph_mds_reply_info_parsed *info)
199{
200 void *p, *end;
201 u32 len;
202 int err;
203
204 info->head = msg->front.iov_base;
205 p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
206 end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
207
208 /* trace */
209 ceph_decode_32_safe(&p, end, len, bad);
210 if (len > 0) {
211 err = parse_reply_info_trace(&p, p+len, info);
212 if (err < 0)
213 goto out_bad;
214 }
215
216 /* dir content */
217 ceph_decode_32_safe(&p, end, len, bad);
218 if (len > 0) {
219 err = parse_reply_info_dir(&p, p+len, info);
220 if (err < 0)
221 goto out_bad;
222 }
223
224 /* snap blob */
225 ceph_decode_32_safe(&p, end, len, bad);
226 info->snapblob_len = len;
227 info->snapblob = p;
228 p += len;
229
230 if (p != end)
231 goto bad;
232 return 0;
233
234bad:
235 err = -EIO;
236out_bad:
237 pr_err("mds parse_reply err %d\n", err);
238 return err;
239}
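
/*
 * The three sections above (trace, dir, snap blob) are u32-length-
 * prefixed blobs packed back to back in the message front. The
 * ceph_decode_*_safe() macros bounds-check before advancing; a
 * hand-rolled sketch of the equivalent check for one section:
 */
static int decode_section(void **p, void *end, void **sec, u32 *sec_len)
{
	if (*p + sizeof(u32) > end)
		return -EIO;
	*sec_len = get_unaligned_le32(*p);	/* asm/unaligned.h */
	*p += sizeof(u32);
	if (*p + *sec_len > end)
		return -EIO;
	*sec = *p;
	*p += *sec_len;
	return 0;
}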
240
241static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
242{
243 kfree(info->dir_in);
244}
245
246
247/*
248 * sessions
249 */
250static const char *session_state_name(int s)
251{
252 switch (s) {
253 case CEPH_MDS_SESSION_NEW: return "new";
254 case CEPH_MDS_SESSION_OPENING: return "opening";
255 case CEPH_MDS_SESSION_OPEN: return "open";
256 case CEPH_MDS_SESSION_HUNG: return "hung";
257 case CEPH_MDS_SESSION_CLOSING: return "closing";
258 case CEPH_MDS_SESSION_RESTARTING: return "restarting";
259 case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
260 default: return "???";
261 }
262}
263
264static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
265{
266 if (atomic_inc_not_zero(&s->s_ref)) {
267 dout("mdsc get_session %p %d -> %d\n", s,
268 atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
269 return s;
270 } else {
271		dout("mdsc get_session %p 0 -- FAIL\n", s);
272 return NULL;
273 }
274}
275
276void ceph_put_mds_session(struct ceph_mds_session *s)
277{
278 dout("mdsc put_session %p %d -> %d\n", s,
279 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
280 if (atomic_dec_and_test(&s->s_ref)) {
281 if (s->s_authorizer)
282 s->s_mdsc->client->monc.auth->ops->destroy_authorizer(
283 s->s_mdsc->client->monc.auth, s->s_authorizer);
284 kfree(s);
285 }
286}
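
/*
 * The get/put pair above is the standard inc-not-zero idiom: a lookup
 * may race with the final put, so a reference is only taken if the
 * count is still nonzero (no 0 -> 1 resurrection). A minimal generic
 * sketch with a hypothetical object type:
 */
struct obj {
	atomic_t ref;
};

static struct obj *obj_get(struct obj *o)
{
	return atomic_inc_not_zero(&o->ref) ? o : NULL;
}

static void obj_put(struct obj *o)
{
	if (atomic_dec_and_test(&o->ref))
		kfree(o);
}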
287
288/*
289 * called under mdsc->mutex
290 */
291struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
292 int mds)
293{
294 struct ceph_mds_session *session;
295
296 if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL)
297 return NULL;
298 session = mdsc->sessions[mds];
299 dout("lookup_mds_session %p %d\n", session,
300 atomic_read(&session->s_ref));
301 get_session(session);
302 return session;
303}
304
305static bool __have_session(struct ceph_mds_client *mdsc, int mds)
306{
307 if (mds >= mdsc->max_sessions)
308 return false;
309 return mdsc->sessions[mds];
310}
311
312static int __verify_registered_session(struct ceph_mds_client *mdsc,
313 struct ceph_mds_session *s)
314{
315 if (s->s_mds >= mdsc->max_sessions ||
316 mdsc->sessions[s->s_mds] != s)
317 return -ENOENT;
318 return 0;
319}
320
321/*
322 * create+register a new session for given mds.
323 * called under mdsc->mutex.
324 */
325static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
326 int mds)
327{
328 struct ceph_mds_session *s;
329
330	s = kzalloc(sizeof(*s), GFP_NOFS);
	if (!s)
		return ERR_PTR(-ENOMEM);
331 s->s_mdsc = mdsc;
332 s->s_mds = mds;
333 s->s_state = CEPH_MDS_SESSION_NEW;
334 s->s_ttl = 0;
335 s->s_seq = 0;
336 mutex_init(&s->s_mutex);
337
338 ceph_con_init(mdsc->client->msgr, &s->s_con);
339 s->s_con.private = s;
340 s->s_con.ops = &mds_con_ops;
341 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
342 s->s_con.peer_name.num = cpu_to_le64(mds);
343
344 spin_lock_init(&s->s_cap_lock);
345 s->s_cap_gen = 0;
346 s->s_cap_ttl = 0;
347 s->s_renew_requested = 0;
348 s->s_renew_seq = 0;
349 INIT_LIST_HEAD(&s->s_caps);
350 s->s_nr_caps = 0;
351 s->s_trim_caps = 0;
352 atomic_set(&s->s_ref, 1);
353 INIT_LIST_HEAD(&s->s_waiting);
354 INIT_LIST_HEAD(&s->s_unsafe);
355 s->s_num_cap_releases = 0;
356 s->s_cap_iterator = NULL;
357 INIT_LIST_HEAD(&s->s_cap_releases);
358 INIT_LIST_HEAD(&s->s_cap_releases_done);
359 INIT_LIST_HEAD(&s->s_cap_flushing);
360 INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
361
362 dout("register_session mds%d\n", mds);
363 if (mds >= mdsc->max_sessions) {
364 int newmax = 1 << get_count_order(mds+1);
365 struct ceph_mds_session **sa;
366
367 dout("register_session realloc to %d\n", newmax);
368 sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
369 if (sa == NULL)
370 goto fail_realloc;
371 if (mdsc->sessions) {
372 memcpy(sa, mdsc->sessions,
373 mdsc->max_sessions * sizeof(void *));
374 kfree(mdsc->sessions);
375 }
376 mdsc->sessions = sa;
377 mdsc->max_sessions = newmax;
378 }
379 mdsc->sessions[mds] = s;
380 atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
381
382 ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
383
384 return s;
385
386fail_realloc:
387 kfree(s);
388 return ERR_PTR(-ENOMEM);
389}
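
/*
 * Note on sizing above: the sessions array grows to the next power of
 * two covering the new rank (1 << get_count_order(mds + 1)), so
 * registering ranks in increasing order reallocates O(log n) times
 * rather than once per rank.
 */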
390
391/*
392 * called under mdsc->mutex
393 */
394static void __unregister_session(struct ceph_mds_client *mdsc,
395 struct ceph_mds_session *s)
396{
397 dout("__unregister_session mds%d %p\n", s->s_mds, s);
398 BUG_ON(mdsc->sessions[s->s_mds] != s);
399 mdsc->sessions[s->s_mds] = NULL;
400 ceph_con_close(&s->s_con);
401 ceph_put_mds_session(s);
402}
403
404/*
405 * drop session refs in request.
406 *
407 * should be last request ref, or hold mdsc->mutex
408 */
409static void put_request_session(struct ceph_mds_request *req)
410{
411 if (req->r_session) {
412 ceph_put_mds_session(req->r_session);
413 req->r_session = NULL;
414 }
415}
416
417void ceph_mdsc_release_request(struct kref *kref)
418{
419 struct ceph_mds_request *req = container_of(kref,
420 struct ceph_mds_request,
421 r_kref);
422 if (req->r_request)
423 ceph_msg_put(req->r_request);
424 if (req->r_reply) {
425 ceph_msg_put(req->r_reply);
426 destroy_reply_info(&req->r_reply_info);
427 }
428 if (req->r_inode) {
429 ceph_put_cap_refs(ceph_inode(req->r_inode),
430 CEPH_CAP_PIN);
431 iput(req->r_inode);
432 }
433 if (req->r_locked_dir)
434 ceph_put_cap_refs(ceph_inode(req->r_locked_dir),
435 CEPH_CAP_PIN);
436 if (req->r_target_inode)
437 iput(req->r_target_inode);
438 if (req->r_dentry)
439 dput(req->r_dentry);
440 if (req->r_old_dentry) {
441 ceph_put_cap_refs(
442 ceph_inode(req->r_old_dentry->d_parent->d_inode),
443 CEPH_CAP_PIN);
444 dput(req->r_old_dentry);
445 }
446 kfree(req->r_path1);
447 kfree(req->r_path2);
448 put_request_session(req);
449 ceph_unreserve_caps(&req->r_caps_reservation);
450 kfree(req);
451}
452
453/*
454 * lookup request, bump ref if found.
455 *
456 * called under mdsc->mutex.
457 */
458static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
459 u64 tid)
460{
461 struct ceph_mds_request *req;
462 struct rb_node *n = mdsc->request_tree.rb_node;
463
464 while (n) {
465 req = rb_entry(n, struct ceph_mds_request, r_node);
466 if (tid < req->r_tid)
467 n = n->rb_left;
468 else if (tid > req->r_tid)
469 n = n->rb_right;
470 else {
471 ceph_mdsc_get_request(req);
472 return req;
473 }
474 }
475 return NULL;
476}
477
478static void __insert_request(struct ceph_mds_client *mdsc,
479 struct ceph_mds_request *new)
480{
481 struct rb_node **p = &mdsc->request_tree.rb_node;
482 struct rb_node *parent = NULL;
483 struct ceph_mds_request *req = NULL;
484
485 while (*p) {
486 parent = *p;
487 req = rb_entry(parent, struct ceph_mds_request, r_node);
488 if (new->r_tid < req->r_tid)
489 p = &(*p)->rb_left;
490 else if (new->r_tid > req->r_tid)
491 p = &(*p)->rb_right;
492 else
493 BUG();
494 }
495
496 rb_link_node(&new->r_node, parent, p);
497 rb_insert_color(&new->r_node, &mdsc->request_tree);
498}
499
500/*
501 * Register an in-flight request, and assign a tid. Link to directory
502 * we are modifying (if any).
503 *
504 * Called under mdsc->mutex.
505 */
506static void __register_request(struct ceph_mds_client *mdsc,
507 struct ceph_mds_request *req,
508 struct inode *dir)
509{
510 req->r_tid = ++mdsc->last_tid;
511 if (req->r_num_caps)
512 ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps);
513 dout("__register_request %p tid %lld\n", req, req->r_tid);
514 ceph_mdsc_get_request(req);
515 __insert_request(mdsc, req);
516
517 if (dir) {
518 struct ceph_inode_info *ci = ceph_inode(dir);
519
520 spin_lock(&ci->i_unsafe_lock);
521 req->r_unsafe_dir = dir;
522 list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
523 spin_unlock(&ci->i_unsafe_lock);
524 }
525}
526
527static void __unregister_request(struct ceph_mds_client *mdsc,
528 struct ceph_mds_request *req)
529{
530 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
531 rb_erase(&req->r_node, &mdsc->request_tree);
532 ceph_mdsc_put_request(req);
533
534 if (req->r_unsafe_dir) {
535 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
536
537 spin_lock(&ci->i_unsafe_lock);
538 list_del_init(&req->r_unsafe_dir_item);
539 spin_unlock(&ci->i_unsafe_lock);
540 }
541}
542
543/*
544 * Choose mds to send request to next. If there is a hint set in the
545 * request (e.g., due to a prior forward hint from the mds), use that.
546 * Otherwise, consult frag tree and/or caps to identify the
547 * appropriate mds. If all else fails, choose randomly.
548 *
549 * Called under mdsc->mutex.
550 */
551static int __choose_mds(struct ceph_mds_client *mdsc,
552 struct ceph_mds_request *req)
553{
554 struct inode *inode;
555 struct ceph_inode_info *ci;
556 struct ceph_cap *cap;
557 int mode = req->r_direct_mode;
558 int mds = -1;
559 u32 hash = req->r_direct_hash;
560 bool is_hash = req->r_direct_is_hash;
561
562 /*
563 * is there a specific mds we should try? ignore hint if we have
564 * no session and the mds is not up (active or recovering).
565 */
566 if (req->r_resend_mds >= 0 &&
567 (__have_session(mdsc, req->r_resend_mds) ||
568 ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
569 dout("choose_mds using resend_mds mds%d\n",
570 req->r_resend_mds);
571 return req->r_resend_mds;
572 }
573
574 if (mode == USE_RANDOM_MDS)
575 goto random;
576
577 inode = NULL;
578 if (req->r_inode) {
579 inode = req->r_inode;
580 } else if (req->r_dentry) {
581 if (req->r_dentry->d_inode) {
582 inode = req->r_dentry->d_inode;
583 } else {
584 inode = req->r_dentry->d_parent->d_inode;
585 hash = req->r_dentry->d_name.hash;
586 is_hash = true;
587 }
588 }
589 dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
590 (int)hash, mode);
591 if (!inode)
592 goto random;
593 ci = ceph_inode(inode);
594
595 if (is_hash && S_ISDIR(inode->i_mode)) {
596 struct ceph_inode_frag frag;
597 int found;
598
599 ceph_choose_frag(ci, hash, &frag, &found);
600 if (found) {
601 if (mode == USE_ANY_MDS && frag.ndist > 0) {
602 u8 r;
603
604 /* choose a random replica */
605 get_random_bytes(&r, 1);
606 r %= frag.ndist;
607 mds = frag.dist[r];
608 dout("choose_mds %p %llx.%llx "
609 "frag %u mds%d (%d/%d)\n",
610 inode, ceph_vinop(inode),
611				     frag.frag, mds,
612 (int)r, frag.ndist);
613 return mds;
614 }
615
616 /* since this file/dir wasn't known to be
617 * replicated, then we want to look for the
618 * authoritative mds. */
619 mode = USE_AUTH_MDS;
620 if (frag.mds >= 0) {
621 /* choose auth mds */
622 mds = frag.mds;
623 dout("choose_mds %p %llx.%llx "
624 "frag %u mds%d (auth)\n",
625 inode, ceph_vinop(inode), frag.frag, mds);
626 return mds;
627 }
628 }
629 }
630
631 spin_lock(&inode->i_lock);
632 cap = NULL;
633 if (mode == USE_AUTH_MDS)
634 cap = ci->i_auth_cap;
635 if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
636 cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
637 if (!cap) {
638 spin_unlock(&inode->i_lock);
639 goto random;
640 }
641 mds = cap->session->s_mds;
642 dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
643 inode, ceph_vinop(inode), mds,
644 cap == ci->i_auth_cap ? "auth " : "", cap);
645 spin_unlock(&inode->i_lock);
646 return mds;
647
648random:
649 mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
650 dout("choose_mds chose random mds%d\n", mds);
651 return mds;
652}
653
654
655/*
656 * session messages
657 */
658static struct ceph_msg *create_session_msg(u32 op, u64 seq)
659{
660 struct ceph_msg *msg;
661 struct ceph_mds_session_head *h;
662
663 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL);
664 if (IS_ERR(msg)) {
665 pr_err("create_session_msg ENOMEM creating msg\n");
666 return ERR_PTR(PTR_ERR(msg));
667 }
668 h = msg->front.iov_base;
669 h->op = cpu_to_le32(op);
670 h->seq = cpu_to_le64(seq);
671 return msg;
672}
673
674/*
675 * send session open request.
676 *
677 * called under mdsc->mutex
678 */
679static int __open_session(struct ceph_mds_client *mdsc,
680 struct ceph_mds_session *session)
681{
682 struct ceph_msg *msg;
683 int mstate;
684 int mds = session->s_mds;
685 int err = 0;
686
687 /* wait for mds to go active? */
688 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
689 dout("open_session to mds%d (%s)\n", mds,
690 ceph_mds_state_name(mstate));
691 session->s_state = CEPH_MDS_SESSION_OPENING;
692 session->s_renew_requested = jiffies;
693
694 /* send connect message */
695 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
696 if (IS_ERR(msg)) {
697 err = PTR_ERR(msg);
698 goto out;
699 }
700 ceph_con_send(&session->s_con, msg);
701
702out:
703	return err;
704}
705
706/*
707 * session caps
708 */
709
710/*
711 * Free preallocated cap messages assigned to this session
712 */
713static void cleanup_cap_releases(struct ceph_mds_session *session)
714{
715 struct ceph_msg *msg;
716
717 spin_lock(&session->s_cap_lock);
718 while (!list_empty(&session->s_cap_releases)) {
719 msg = list_first_entry(&session->s_cap_releases,
720 struct ceph_msg, list_head);
721 list_del_init(&msg->list_head);
722 ceph_msg_put(msg);
723 }
724 while (!list_empty(&session->s_cap_releases_done)) {
725 msg = list_first_entry(&session->s_cap_releases_done,
726 struct ceph_msg, list_head);
727 list_del_init(&msg->list_head);
728 ceph_msg_put(msg);
729 }
730 spin_unlock(&session->s_cap_lock);
731}
732
733/*
734 * Helper to safely iterate over all caps associated with a session.
735 *
736 * caller must hold session s_mutex
737 */
738static int iterate_session_caps(struct ceph_mds_session *session,
739 int (*cb)(struct inode *, struct ceph_cap *,
740 void *), void *arg)
741{
742 struct list_head *p;
743 struct ceph_cap *cap;
744 struct inode *inode, *last_inode = NULL;
745 struct ceph_cap *old_cap = NULL;
746 int ret;
747
748 dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
749 spin_lock(&session->s_cap_lock);
750 p = session->s_caps.next;
751 while (p != &session->s_caps) {
752 cap = list_entry(p, struct ceph_cap, session_caps);
753 inode = igrab(&cap->ci->vfs_inode);
754 if (!inode) {
755 p = p->next;
756 continue;
757 }
758 session->s_cap_iterator = cap;
759 spin_unlock(&session->s_cap_lock);
760
761 if (last_inode) {
762 iput(last_inode);
763 last_inode = NULL;
764 }
765 if (old_cap) {
766 ceph_put_cap(old_cap);
767 old_cap = NULL;
768 }
769
770 ret = cb(inode, cap, arg);
771 last_inode = inode;
772
773 spin_lock(&session->s_cap_lock);
774 p = p->next;
775 if (cap->ci == NULL) {
776 dout("iterate_session_caps finishing cap %p removal\n",
777 cap);
778 BUG_ON(cap->session != session);
779 list_del_init(&cap->session_caps);
780 session->s_nr_caps--;
781 cap->session = NULL;
782 old_cap = cap; /* put_cap it w/o locks held */
783 }
784 if (ret < 0)
785 goto out;
786 }
787 ret = 0;
788out:
789 session->s_cap_iterator = NULL;
790 spin_unlock(&session->s_cap_lock);
791
792 if (last_inode)
793 iput(last_inode);
794 if (old_cap)
795 ceph_put_cap(old_cap);
796
797 return ret;
798}
799
800static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
801 void *arg)
802{
803 struct ceph_inode_info *ci = ceph_inode(inode);
804 dout("removing cap %p, ci is %p, inode is %p\n",
805 cap, ci, &ci->vfs_inode);
806 ceph_remove_cap(cap);
807 return 0;
808}
809
810/*
811 * caller must hold session s_mutex
812 */
813static void remove_session_caps(struct ceph_mds_session *session)
814{
815 dout("remove_session_caps on %p\n", session);
816 iterate_session_caps(session, remove_session_caps_cb, NULL);
817 BUG_ON(session->s_nr_caps > 0);
818 cleanup_cap_releases(session);
819}
820
821/*
822 * wake up any threads waiting on this session's caps. if the cap is
823 * old (didn't get renewed on the client reconnect), remove it now.
824 *
825 * caller must hold s_mutex.
826 */
827static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
828 void *arg)
829{
830 struct ceph_inode_info *ci = ceph_inode(inode);
831
832 wake_up(&ci->i_cap_wq);
833 if (arg) {
834 spin_lock(&inode->i_lock);
835 ci->i_wanted_max_size = 0;
836 ci->i_requested_max_size = 0;
837 spin_unlock(&inode->i_lock);
838 }
839 return 0;
840}
841
842static void wake_up_session_caps(struct ceph_mds_session *session,
843 int reconnect)
844{
845 dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
846 iterate_session_caps(session, wake_up_session_cb,
847 (void *)(unsigned long)reconnect);
848}
849
850/*
851 * Send periodic message to MDS renewing all currently held caps. The
852 * ack will reset the expiration for all caps from this session.
853 *
854 * caller holds s_mutex
855 */
856static int send_renew_caps(struct ceph_mds_client *mdsc,
857 struct ceph_mds_session *session)
858{
859 struct ceph_msg *msg;
860 int state;
861
862 if (time_after_eq(jiffies, session->s_cap_ttl) &&
863 time_after_eq(session->s_cap_ttl, session->s_renew_requested))
864 pr_info("mds%d caps stale\n", session->s_mds);
865
866 /* do not try to renew caps until a recovering mds has reconnected
867 * with its clients. */
868 state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
869 if (state < CEPH_MDS_STATE_RECONNECT) {
870 dout("send_renew_caps ignoring mds%d (%s)\n",
871 session->s_mds, ceph_mds_state_name(state));
872 return 0;
873 }
874
875 dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
876 ceph_mds_state_name(state));
877 session->s_renew_requested = jiffies;
878 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
879 ++session->s_renew_seq);
880 if (IS_ERR(msg))
881 return PTR_ERR(msg);
882 ceph_con_send(&session->s_con, msg);
883 return 0;
884}
885
886/*
887 * Note new cap ttl, and any transition from stale -> not stale (fresh?).
888 *
889 * Called under session->s_mutex
890 */
891static void renewed_caps(struct ceph_mds_client *mdsc,
892 struct ceph_mds_session *session, int is_renew)
893{
894 int was_stale;
895 int wake = 0;
896
897 spin_lock(&session->s_cap_lock);
898 was_stale = is_renew && (session->s_cap_ttl == 0 ||
899 time_after_eq(jiffies, session->s_cap_ttl));
900
901 session->s_cap_ttl = session->s_renew_requested +
902 mdsc->mdsmap->m_session_timeout*HZ;
903
904 if (was_stale) {
905 if (time_before(jiffies, session->s_cap_ttl)) {
906 pr_info("mds%d caps renewed\n", session->s_mds);
907 wake = 1;
908 } else {
909 pr_info("mds%d caps still stale\n", session->s_mds);
910 }
911 }
912 dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
913 session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
914	     time_before(jiffies, session->s_cap_ttl) ? "fresh" : "stale");
915 spin_unlock(&session->s_cap_lock);
916
917 if (wake)
918 wake_up_session_caps(session, 0);
919}
920
921/*
922 * send a session close request
923 */
924static int request_close_session(struct ceph_mds_client *mdsc,
925 struct ceph_mds_session *session)
926{
927 struct ceph_msg *msg;
928 int err = 0;
929
930 dout("request_close_session mds%d state %s seq %lld\n",
931 session->s_mds, session_state_name(session->s_state),
932 session->s_seq);
933 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
934 if (IS_ERR(msg))
935 err = PTR_ERR(msg);
936 else
937 ceph_con_send(&session->s_con, msg);
938 return err;
939}
940
941/*
942 * Called with s_mutex held.
943 */
944static int __close_session(struct ceph_mds_client *mdsc,
945 struct ceph_mds_session *session)
946{
947 if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
948 return 0;
949 session->s_state = CEPH_MDS_SESSION_CLOSING;
950 return request_close_session(mdsc, session);
951}
952
953/*
954 * Trim old(er) caps.
955 *
956 * Because we can't cache an inode without one or more caps, we do
957 * this indirectly: if a cap is unused, we prune its aliases, at which
958 * point the inode will hopefully get dropped too.
959 *
960 * Yes, this is a bit sloppy. Our only real goal here is to respond to
961 * memory pressure from the MDS, though, so it needn't be perfect.
962 */
963static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
964{
965 struct ceph_mds_session *session = arg;
966 struct ceph_inode_info *ci = ceph_inode(inode);
967 int used, oissued, mine;
968
969 if (session->s_trim_caps <= 0)
970 return -1;
971
972 spin_lock(&inode->i_lock);
973 mine = cap->issued | cap->implemented;
974 used = __ceph_caps_used(ci);
975 oissued = __ceph_caps_issued_other(ci, cap);
976
977 dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n",
978 inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
979 ceph_cap_string(used));
980 if (ci->i_dirty_caps)
981 goto out; /* dirty caps */
982 if ((used & ~oissued) & mine)
983 goto out; /* we need these caps */
984
985 session->s_trim_caps--;
986 if (oissued) {
987 /* we aren't the only cap.. just remove us */
988 __ceph_remove_cap(cap);
989 } else {
990 /* try to drop referring dentries */
991 spin_unlock(&inode->i_lock);
992 d_prune_aliases(inode);
993 dout("trim_caps_cb %p cap %p pruned, count now %d\n",
994 inode, cap, atomic_read(&inode->i_count));
995 return 0;
996 }
997
998out:
999 spin_unlock(&inode->i_lock);
1000 return 0;
1001}
1002
1003/*
1004 * Trim session cap count down to some max number.
1005 */
1006static int trim_caps(struct ceph_mds_client *mdsc,
1007 struct ceph_mds_session *session,
1008 int max_caps)
1009{
1010 int trim_caps = session->s_nr_caps - max_caps;
1011
1012 dout("trim_caps mds%d start: %d / %d, trim %d\n",
1013 session->s_mds, session->s_nr_caps, max_caps, trim_caps);
1014 if (trim_caps > 0) {
1015 session->s_trim_caps = trim_caps;
1016 iterate_session_caps(session, trim_caps_cb, session);
1017 dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
1018 session->s_mds, session->s_nr_caps, max_caps,
1019 trim_caps - session->s_trim_caps);
1020 session->s_trim_caps = 0;
1021 }
1022 return 0;
1023}
1024
1025/*
1026 * Allocate cap_release messages. If there is a partially full message
1027 * in the queue, try to allocate enough to cover its remainder, so that
1028 * we can send it immediately.
1029 *
1030 * Called under s_mutex.
1031 */
1032static int add_cap_releases(struct ceph_mds_client *mdsc,
1033 struct ceph_mds_session *session,
1034 int extra)
1035{
1036 struct ceph_msg *msg;
1037 struct ceph_mds_cap_release *head;
1038 int err = -ENOMEM;
1039
1040 if (extra < 0)
1041 extra = mdsc->client->mount_args->cap_release_safety;
1042
1043 spin_lock(&session->s_cap_lock);
1044
1045 if (!list_empty(&session->s_cap_releases)) {
1046 msg = list_first_entry(&session->s_cap_releases,
1047 struct ceph_msg,
1048 list_head);
1049 head = msg->front.iov_base;
1050 extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
1051 }
1052
1053 while (session->s_num_cap_releases < session->s_nr_caps + extra) {
1054 spin_unlock(&session->s_cap_lock);
1055 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
1056 0, 0, NULL);
1057 if (!msg)
1058 goto out_unlocked;
1059 dout("add_cap_releases %p msg %p now %d\n", session, msg,
1060 (int)msg->front.iov_len);
1061 head = msg->front.iov_base;
1062 head->num = cpu_to_le32(0);
1063 msg->front.iov_len = sizeof(*head);
1064 spin_lock(&session->s_cap_lock);
1065 list_add(&msg->list_head, &session->s_cap_releases);
1066 session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
1067 }
1068
1069 if (!list_empty(&session->s_cap_releases)) {
1070 msg = list_first_entry(&session->s_cap_releases,
1071 struct ceph_msg,
1072 list_head);
1073 head = msg->front.iov_base;
1074 if (head->num) {
1075 dout(" queueing non-full %p (%d)\n", msg,
1076 le32_to_cpu(head->num));
1077 list_move_tail(&msg->list_head,
1078 &session->s_cap_releases_done);
1079 session->s_num_cap_releases -=
1080 CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
1081 }
1082 }
1083 err = 0;
1084 spin_unlock(&session->s_cap_lock);
1085out_unlocked:
1086 return err;
1087}
1088
1089/*
1090 * flush all dirty inode data to disk.
1091 *
1092 * returns true if we've flushed through want_flush_seq
1093 */
1094static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
1095{
1096 int mds, ret = 1;
1097
1098 dout("check_cap_flush want %lld\n", want_flush_seq);
1099 mutex_lock(&mdsc->mutex);
1100 for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
1101 struct ceph_mds_session *session = mdsc->sessions[mds];
1102
1103 if (!session)
1104 continue;
1105 get_session(session);
1106 mutex_unlock(&mdsc->mutex);
1107
1108 mutex_lock(&session->s_mutex);
1109 if (!list_empty(&session->s_cap_flushing)) {
1110 struct ceph_inode_info *ci =
1111 list_entry(session->s_cap_flushing.next,
1112 struct ceph_inode_info,
1113 i_flushing_item);
1114 struct inode *inode = &ci->vfs_inode;
1115
1116 spin_lock(&inode->i_lock);
1117 if (ci->i_cap_flush_seq <= want_flush_seq) {
1118 dout("check_cap_flush still flushing %p "
1119 "seq %lld <= %lld to mds%d\n", inode,
1120 ci->i_cap_flush_seq, want_flush_seq,
1121 session->s_mds);
1122 ret = 0;
1123 }
1124 spin_unlock(&inode->i_lock);
1125 }
1126 mutex_unlock(&session->s_mutex);
1127 ceph_put_mds_session(session);
1128
1129 if (!ret)
1130 return ret;
1131 mutex_lock(&mdsc->mutex);
1132 }
1133
1134 mutex_unlock(&mdsc->mutex);
1135 dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
1136 return ret;
1137}
1138
1139/*
1140 * called under s_mutex
1141 */
1142static void send_cap_releases(struct ceph_mds_client *mdsc,
1143 struct ceph_mds_session *session)
1144{
1145 struct ceph_msg *msg;
1146
1147 dout("send_cap_releases mds%d\n", session->s_mds);
1148 while (1) {
1149 spin_lock(&session->s_cap_lock);
1150 if (list_empty(&session->s_cap_releases_done))
1151 break;
1152 msg = list_first_entry(&session->s_cap_releases_done,
1153 struct ceph_msg, list_head);
1154 list_del_init(&msg->list_head);
1155 spin_unlock(&session->s_cap_lock);
1156 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1157 dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
1158 ceph_con_send(&session->s_con, msg);
1159 }
1160 spin_unlock(&session->s_cap_lock);
1161}
1162
1163/*
1164 * requests
1165 */
1166
1167/*
1168 * Create an mds request.
1169 */
1170struct ceph_mds_request *
1171ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1172{
1173 struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
1174
1175 if (!req)
1176 return ERR_PTR(-ENOMEM);
1177
1178 req->r_started = jiffies;
1179 req->r_resend_mds = -1;
1180 INIT_LIST_HEAD(&req->r_unsafe_dir_item);
1181 req->r_fmode = -1;
1182 kref_init(&req->r_kref);
1183 INIT_LIST_HEAD(&req->r_wait);
1184 init_completion(&req->r_completion);
1185 init_completion(&req->r_safe_completion);
1186 INIT_LIST_HEAD(&req->r_unsafe_item);
1187
1188 req->r_op = op;
1189 req->r_direct_mode = mode;
1190 return req;
1191}
1192
1193/*
1194 * return the oldest (lowest tid) request in the request tree; NULL if none.
1195 *
1196 * called under mdsc->mutex.
1197 */
1198static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
1199{
1200 if (RB_EMPTY_ROOT(&mdsc->request_tree))
1201 return NULL;
1202 return rb_entry(rb_first(&mdsc->request_tree),
1203 struct ceph_mds_request, r_node);
1204}
1205
1206static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
1207{
1208 struct ceph_mds_request *req = __get_oldest_req(mdsc);
1209
1210 if (req)
1211 return req->r_tid;
1212 return 0;
1213}
1214
1215/*
1216 * Build a dentry's path. Allocate on heap; caller must kfree. Based
1217 * on build_path_from_dentry in fs/cifs/dir.c.
1218 *
1219 * If @stop_on_nosnap, generate path relative to the first non-snapped
1220 * inode.
1221 *
1222 * Encode hidden .snap dirs as a double /, i.e.
1223 * foo/.snap/bar -> foo//bar
1224 */
1225char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
1226 int stop_on_nosnap)
1227{
1228 struct dentry *temp;
1229 char *path;
1230 int len, pos;
1231
1232 if (dentry == NULL)
1233 return ERR_PTR(-EINVAL);
1234
1235retry:
1236 len = 0;
1237 for (temp = dentry; !IS_ROOT(temp);) {
1238 struct inode *inode = temp->d_inode;
1239 if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
1240 len++; /* slash only */
1241 else if (stop_on_nosnap && inode &&
1242 ceph_snap(inode) == CEPH_NOSNAP)
1243 break;
1244 else
1245 len += 1 + temp->d_name.len;
1246 temp = temp->d_parent;
1247 if (temp == NULL) {
1248 pr_err("build_path_dentry corrupt dentry %p\n", dentry);
1249 return ERR_PTR(-EINVAL);
1250 }
1251 }
1252 if (len)
1253 len--; /* no leading '/' */
1254
1255 path = kmalloc(len+1, GFP_NOFS);
1256 if (path == NULL)
1257 return ERR_PTR(-ENOMEM);
1258 pos = len;
1259 path[pos] = 0; /* trailing null */
1260 for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
1261 struct inode *inode = temp->d_inode;
1262
1263 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
1264 dout("build_path_dentry path+%d: %p SNAPDIR\n",
1265 pos, temp);
1266 } else if (stop_on_nosnap && inode &&
1267 ceph_snap(inode) == CEPH_NOSNAP) {
1268 break;
1269 } else {
1270 pos -= temp->d_name.len;
1271 if (pos < 0)
1272 break;
1273 strncpy(path + pos, temp->d_name.name,
1274 temp->d_name.len);
1275 dout("build_path_dentry path+%d: %p '%.*s'\n",
1276 pos, temp, temp->d_name.len, path + pos);
1277 }
1278 if (pos)
1279 path[--pos] = '/';
1280 temp = temp->d_parent;
1281 if (temp == NULL) {
1282 pr_err("build_path_dentry corrupt dentry\n");
1283 kfree(path);
1284 return ERR_PTR(-EINVAL);
1285 }
1286 }
1287 if (pos != 0) {
1288 pr_err("build_path_dentry did not end path lookup where "
1289 "expected, namelen is %d, pos is %d\n", len, pos);
1290 /* presumably this is only possible if racing with a
1291 rename of one of the parent directories (we can not
1292 lock the dentries above us to prevent this, but
1293 retrying should be harmless) */
1294 kfree(path);
1295 goto retry;
1296 }
1297
1298 *base = ceph_ino(temp->d_inode);
1299 *plen = len;
1300 dout("build_path_dentry on %p %d built %llx '%.*s'\n",
1301 dentry, atomic_read(&dentry->d_count), *base, len, path);
1302 return path;
1303}
1304
1305static int build_dentry_path(struct dentry *dentry,
1306 const char **ppath, int *ppathlen, u64 *pino,
1307 int *pfreepath)
1308{
1309 char *path;
1310
1311 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) {
1312 *pino = ceph_ino(dentry->d_parent->d_inode);
1313 *ppath = dentry->d_name.name;
1314 *ppathlen = dentry->d_name.len;
1315 return 0;
1316 }
1317 path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
1318 if (IS_ERR(path))
1319 return PTR_ERR(path);
1320 *ppath = path;
1321 *pfreepath = 1;
1322 return 0;
1323}
1324
1325static int build_inode_path(struct inode *inode,
1326 const char **ppath, int *ppathlen, u64 *pino,
1327 int *pfreepath)
1328{
1329 struct dentry *dentry;
1330 char *path;
1331
1332 if (ceph_snap(inode) == CEPH_NOSNAP) {
1333 *pino = ceph_ino(inode);
1334 *ppathlen = 0;
1335 return 0;
1336 }
1337 dentry = d_find_alias(inode);
1338 path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
1339 dput(dentry);
1340 if (IS_ERR(path))
1341 return PTR_ERR(path);
1342 *ppath = path;
1343 *pfreepath = 1;
1344 return 0;
1345}
1346
1347/*
1348 * request arguments may be specified via an inode *, a dentry *, or
1349 * an explicit ino+path.
1350 */
1351static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
1352 const char *rpath, u64 rino,
1353 const char **ppath, int *pathlen,
1354 u64 *ino, int *freepath)
1355{
1356 int r = 0;
1357
1358 if (rinode) {
1359 r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
1360 dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
1361 ceph_snap(rinode));
1362 } else if (rdentry) {
1363 r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
1364 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
1365 *ppath);
1366 } else if (rpath) {
1367 *ino = rino;
1368 *ppath = rpath;
1369 *pathlen = strlen(rpath);
1370 dout(" path %.*s\n", *pathlen, rpath);
1371 }
1372
1373 return r;
1374}
1375
1376/*
1377 * called under mdsc->mutex
1378 */
1379static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1380 struct ceph_mds_request *req,
1381 int mds)
1382{
1383 struct ceph_msg *msg;
1384 struct ceph_mds_request_head *head;
1385 const char *path1 = NULL;
1386 const char *path2 = NULL;
1387 u64 ino1 = 0, ino2 = 0;
1388 int pathlen1 = 0, pathlen2 = 0;
1389 int freepath1 = 0, freepath2 = 0;
1390 int len;
1391 u16 releases;
1392 void *p, *end;
1393 int ret;
1394
1395 ret = set_request_path_attr(req->r_inode, req->r_dentry,
1396 req->r_path1, req->r_ino1.ino,
1397 &path1, &pathlen1, &ino1, &freepath1);
1398 if (ret < 0) {
1399 msg = ERR_PTR(ret);
1400 goto out;
1401 }
1402
1403 ret = set_request_path_attr(NULL, req->r_old_dentry,
1404 req->r_path2, req->r_ino2.ino,
1405 &path2, &pathlen2, &ino2, &freepath2);
1406 if (ret < 0) {
1407 msg = ERR_PTR(ret);
1408 goto out_free1;
1409 }
1410
1411 len = sizeof(*head) +
1412 pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64));
1413
1414 /* calculate (max) length for cap releases */
1415 len += sizeof(struct ceph_mds_request_release) *
1416 (!!req->r_inode_drop + !!req->r_dentry_drop +
1417 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
1418 if (req->r_dentry_drop)
1419 len += req->r_dentry->d_name.len;
1420 if (req->r_old_dentry_drop)
1421 len += req->r_old_dentry->d_name.len;
1422
1423 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL);
1424 if (IS_ERR(msg))
1425 goto out_free2;
1426
1427 msg->hdr.tid = cpu_to_le64(req->r_tid);
1428
1429 head = msg->front.iov_base;
1430 p = msg->front.iov_base + sizeof(*head);
1431 end = msg->front.iov_base + msg->front.iov_len;
1432
1433 head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
1434 head->op = cpu_to_le32(req->r_op);
1435 head->caller_uid = cpu_to_le32(current_fsuid());
1436 head->caller_gid = cpu_to_le32(current_fsgid());
1437 head->args = req->r_args;
1438
1439 ceph_encode_filepath(&p, end, ino1, path1);
1440 ceph_encode_filepath(&p, end, ino2, path2);
1441
1442 /* cap releases */
1443 releases = 0;
1444 if (req->r_inode_drop)
1445 releases += ceph_encode_inode_release(&p,
1446 req->r_inode ? req->r_inode : req->r_dentry->d_inode,
1447 mds, req->r_inode_drop, req->r_inode_unless, 0);
1448 if (req->r_dentry_drop)
1449 releases += ceph_encode_dentry_release(&p, req->r_dentry,
1450 mds, req->r_dentry_drop, req->r_dentry_unless);
1451 if (req->r_old_dentry_drop)
1452 releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
1453 mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
1454 if (req->r_old_inode_drop)
1455 releases += ceph_encode_inode_release(&p,
1456 req->r_old_dentry->d_inode,
1457 mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
1458 head->num_releases = cpu_to_le16(releases);
1459
1460 BUG_ON(p > end);
1461 msg->front.iov_len = p - msg->front.iov_base;
1462 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1463
1464 msg->pages = req->r_pages;
1465 msg->nr_pages = req->r_num_pages;
1466 msg->hdr.data_len = cpu_to_le32(req->r_data_len);
1467 msg->hdr.data_off = cpu_to_le16(0);
1468
1469out_free2:
1470 if (freepath2)
1471 kfree((char *)path2);
1472out_free1:
1473 if (freepath1)
1474 kfree((char *)path1);
1475out:
1476 return msg;
1477}
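
/*
 * Resulting front layout, per the length computation above: the request
 * head, two encoded filepaths (each a u64 ino plus a length-prefixed
 * string), then up to four cap release records with any dentry names
 * appended.
 */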
1478
1479/*
1480 * called under mdsc->mutex if error, under no mutex if
1481 * success.
1482 */
1483static void complete_request(struct ceph_mds_client *mdsc,
1484 struct ceph_mds_request *req)
1485{
1486 if (req->r_callback)
1487 req->r_callback(mdsc, req);
1488 else
1489 complete(&req->r_completion);
1490}
1491
1492/*
1493 * called under mdsc->mutex
1494 */
1495static int __prepare_send_request(struct ceph_mds_client *mdsc,
1496 struct ceph_mds_request *req,
1497 int mds)
1498{
1499 struct ceph_mds_request_head *rhead;
1500 struct ceph_msg *msg;
1501 int flags = 0;
1502
1503 req->r_mds = mds;
1504 req->r_attempts++;
1505 dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
1506 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
1507
1508 if (req->r_request) {
1509 ceph_msg_put(req->r_request);
1510 req->r_request = NULL;
1511 }
1512 msg = create_request_message(mdsc, req, mds);
1513 if (IS_ERR(msg)) {
1514 req->r_reply = ERR_PTR(PTR_ERR(msg));
1515 complete_request(mdsc, req);
1516 return -PTR_ERR(msg);
1517 }
1518 req->r_request = msg;
1519
1520 rhead = msg->front.iov_base;
1521 rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
1522 if (req->r_got_unsafe)
1523 flags |= CEPH_MDS_FLAG_REPLAY;
1524 if (req->r_locked_dir)
1525 flags |= CEPH_MDS_FLAG_WANT_DENTRY;
1526 rhead->flags = cpu_to_le32(flags);
1527 rhead->num_fwd = req->r_num_fwd;
1528 rhead->num_retry = req->r_attempts - 1;
1529
1530 dout(" r_locked_dir = %p\n", req->r_locked_dir);
1531
1532 if (req->r_target_inode && req->r_got_unsafe)
1533 rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
1534 else
1535 rhead->ino = 0;
1536 return 0;
1537}
1538
1539/*
1540 * send request, or put it on the appropriate wait list.
1541 */
1542static int __do_request(struct ceph_mds_client *mdsc,
1543 struct ceph_mds_request *req)
1544{
1545 struct ceph_mds_session *session = NULL;
1546 int mds = -1;
1547 int err = -EAGAIN;
1548
1549 if (req->r_reply)
1550 goto out;
1551
1552 if (req->r_timeout &&
1553 time_after_eq(jiffies, req->r_started + req->r_timeout)) {
1554 dout("do_request timed out\n");
1555 err = -EIO;
1556 goto finish;
1557 }
1558
1559 mds = __choose_mds(mdsc, req);
1560 if (mds < 0 ||
1561 ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
1562 dout("do_request no mds or not active, waiting for map\n");
1563 list_add(&req->r_wait, &mdsc->waiting_for_map);
1564 goto out;
1565 }
1566
1567 /* get, open session */
1568 session = __ceph_lookup_mds_session(mdsc, mds);
1569 if (!session)
1570 session = register_session(mdsc, mds);
1571 dout("do_request mds%d session %p state %s\n", mds, session,
1572 session_state_name(session->s_state));
1573 if (session->s_state != CEPH_MDS_SESSION_OPEN &&
1574 session->s_state != CEPH_MDS_SESSION_HUNG) {
1575 if (session->s_state == CEPH_MDS_SESSION_NEW ||
1576 session->s_state == CEPH_MDS_SESSION_CLOSING)
1577 __open_session(mdsc, session);
1578 list_add(&req->r_wait, &session->s_waiting);
1579 goto out_session;
1580 }
1581
1582 /* send request */
1583 req->r_session = get_session(session);
1584 req->r_resend_mds = -1; /* forget any previous mds hint */
1585
1586 if (req->r_request_started == 0) /* note request start time */
1587 req->r_request_started = jiffies;
1588
1589 err = __prepare_send_request(mdsc, req, mds);
1590 if (!err) {
1591 ceph_msg_get(req->r_request);
1592 ceph_con_send(&session->s_con, req->r_request);
1593 }
1594
1595out_session:
1596 ceph_put_mds_session(session);
1597out:
1598 return err;
1599
1600finish:
1601 req->r_reply = ERR_PTR(err);
1602 complete_request(mdsc, req);
1603 goto out;
1604}
1605
1606/*
1607 * called under mdsc->mutex
1608 */
1609static void __wake_requests(struct ceph_mds_client *mdsc,
1610 struct list_head *head)
1611{
1612 struct ceph_mds_request *req, *nreq;
1613
1614 list_for_each_entry_safe(req, nreq, head, r_wait) {
1615 list_del_init(&req->r_wait);
1616 __do_request(mdsc, req);
1617 }
1618}
1619
1620/*
1621 * Wake up threads with requests pending for @mds, so that they can
1622 * resubmit their requests to a possibly different mds. If @all is set,
1623 * wake up if their requests have been forwarded to @mds, too.
1624 */
1625static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all)
1626{
1627 struct ceph_mds_request *req;
1628 struct rb_node *p;
1629
1630 dout("kick_requests mds%d\n", mds);
1631 for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) {
1632 req = rb_entry(p, struct ceph_mds_request, r_node);
1633 if (req->r_got_unsafe)
1634 continue;
1635 if (req->r_session &&
1636 req->r_session->s_mds == mds) {
1637 dout(" kicking tid %llu\n", req->r_tid);
1638 put_request_session(req);
1639 __do_request(mdsc, req);
1640 }
1641 }
1642}
1643
1644void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
1645 struct ceph_mds_request *req)
1646{
1647 dout("submit_request on %p\n", req);
1648 mutex_lock(&mdsc->mutex);
1649 __register_request(mdsc, req, NULL);
1650 __do_request(mdsc, req);
1651 mutex_unlock(&mdsc->mutex);
1652}
1653
1654/*
1655 * Synchronously perform an mds request, taking care of all of the
1656 * session setup, forwarding, and retry details.
1657 */
1658int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
1659 struct inode *dir,
1660 struct ceph_mds_request *req)
1661{
1662 int err;
1663
1664 dout("do_request on %p\n", req);
1665
1666 /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
1667 if (req->r_inode)
1668 ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
1669 if (req->r_locked_dir)
1670 ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
1671 if (req->r_old_dentry)
1672 ceph_get_cap_refs(
1673 ceph_inode(req->r_old_dentry->d_parent->d_inode),
1674 CEPH_CAP_PIN);
1675
1676 /* issue */
1677 mutex_lock(&mdsc->mutex);
1678 __register_request(mdsc, req, dir);
1679 __do_request(mdsc, req);
1680
1681 /* wait */
1682 if (!req->r_reply) {
1683 mutex_unlock(&mdsc->mutex);
1684 if (req->r_timeout) {
1685 err = (long)wait_for_completion_interruptible_timeout(
1686 &req->r_completion, req->r_timeout);
1687 if (err == 0)
1688 req->r_reply = ERR_PTR(-EIO);
1689 else if (err < 0)
1690 req->r_reply = ERR_PTR(err);
1691 } else {
1692 err = wait_for_completion_interruptible(
1693 &req->r_completion);
1694 if (err)
1695 req->r_reply = ERR_PTR(err);
1696 }
1697 mutex_lock(&mdsc->mutex);
1698 }
1699
1700 if (IS_ERR(req->r_reply)) {
1701 err = PTR_ERR(req->r_reply);
1702 req->r_reply = NULL;
1703
1704 if (err == -ERESTARTSYS) {
1705 /* aborted */
1706 req->r_aborted = true;
1707
1708 if (req->r_locked_dir &&
1709 (req->r_op & CEPH_MDS_OP_WRITE)) {
1710 struct ceph_inode_info *ci =
1711 ceph_inode(req->r_locked_dir);
1712
1713 dout("aborted, clearing I_COMPLETE on %p\n",
1714 req->r_locked_dir);
1715 spin_lock(&req->r_locked_dir->i_lock);
1716 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1717 ci->i_release_count++;
1718 spin_unlock(&req->r_locked_dir->i_lock);
1719 }
1720 } else {
1721 /* clean up this request */
1722 __unregister_request(mdsc, req);
1723 if (!list_empty(&req->r_unsafe_item))
1724 list_del_init(&req->r_unsafe_item);
1725 complete(&req->r_safe_completion);
1726 }
1727 } else if (req->r_err) {
1728 err = req->r_err;
1729 } else {
1730 err = le32_to_cpu(req->r_reply_info.head->result);
1731 }
1732 mutex_unlock(&mdsc->mutex);
1733
1734 dout("do_request %p done, result %d\n", req, err);
1735 return err;
1736}
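
/*
 * Typical caller pattern (cf. ceph_do_getattr() earlier in this patch):
 * allocate a request, fill in the args, run it synchronously, and drop
 * the ref. A minimal sketch, assuming an inode in hand:
 */
static int do_simple_getattr(struct ceph_mds_client *mdsc,
			     struct inode *inode, int mask)
{
	struct ceph_mds_request *req;
	int err;

	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR,
				       USE_ANY_MDS);
	if (IS_ERR(req))
		return PTR_ERR(req);
	req->r_inode = igrab(inode);	/* do_request takes a CAP_PIN ref */
	req->r_num_caps = 1;
	req->r_args.getattr.mask = cpu_to_le32(mask);
	err = ceph_mdsc_do_request(mdsc, NULL, req);
	ceph_mdsc_put_request(req);
	return err;
}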
1737
1738/*
1739 * Handle mds reply.
1740 *
1741 * We take the session mutex and parse and process the reply immediately.
1742 * This preserves the logical ordering of replies, capabilities, etc., sent
1743 * by the MDS as they are applied to our local cache.
1744 */
1745static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1746{
1747 struct ceph_mds_client *mdsc = session->s_mdsc;
1748 struct ceph_mds_request *req;
1749 struct ceph_mds_reply_head *head = msg->front.iov_base;
1750 struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */
1751 u64 tid;
1752 int err, result;
1753 int mds = session->s_mds;
1754
1755 if (msg->front.iov_len < sizeof(*head)) {
1756 pr_err("mdsc_handle_reply got corrupt (short) reply\n");
1757 ceph_msg_dump(msg);
1758 return;
1759 }
1760
1761 /* get request, session */
1762 tid = le64_to_cpu(msg->hdr.tid);
1763 mutex_lock(&mdsc->mutex);
1764 req = __lookup_request(mdsc, tid);
1765 if (!req) {
1766 dout("handle_reply on unknown tid %llu\n", tid);
1767 mutex_unlock(&mdsc->mutex);
1768 return;
1769 }
1770 dout("handle_reply %p\n", req);
1771
1772 /* correct session? */
1773	if (req->r_session != session) {
1774 pr_err("mdsc_handle_reply got %llu on session mds%d"
1775 " not mds%d\n", tid, session->s_mds,
1776 req->r_session ? req->r_session->s_mds : -1);
1777 mutex_unlock(&mdsc->mutex);
1778 goto out;
1779 }
1780
1781 /* dup? */
1782 if ((req->r_got_unsafe && !head->safe) ||
1783 (req->r_got_safe && head->safe)) {
1784 pr_warning("got a dup %s reply on %llu from mds%d\n",
1785 head->safe ? "safe" : "unsafe", tid, mds);
1786 mutex_unlock(&mdsc->mutex);
1787 goto out;
1788 }
1789
1790 result = le32_to_cpu(head->result);
1791
1792 /*
1793 * Tolerate 2 consecutive ESTALEs from the same mds.
1794 * FIXME: we should be looking at the cap migrate_seq.
1795 */
1796 if (result == -ESTALE) {
1797 req->r_direct_mode = USE_AUTH_MDS;
1798 req->r_num_stale++;
1799 if (req->r_num_stale <= 2) {
1800 __do_request(mdsc, req);
1801 mutex_unlock(&mdsc->mutex);
1802 goto out;
1803 }
1804 } else {
1805 req->r_num_stale = 0;
1806 }
1807
1808 if (head->safe) {
1809 req->r_got_safe = true;
1810 __unregister_request(mdsc, req);
1811 complete(&req->r_safe_completion);
1812
1813 if (req->r_got_unsafe) {
1814 /*
1815 * We already handled the unsafe response, now do the
1816 * cleanup. No need to examine the response; the MDS
1817 * doesn't include any result info in the safe
1818 * response. And even if it did, there is nothing
1819 * useful we could do with a revised return value.
1820 */
1821 dout("got safe reply %llu, mds%d\n", tid, mds);
1822 list_del_init(&req->r_unsafe_item);
1823
1824 /* last unsafe request during umount? */
1825 if (mdsc->stopping && !__get_oldest_req(mdsc))
1826 complete(&mdsc->safe_umount_waiters);
1827 mutex_unlock(&mdsc->mutex);
1828 goto out;
1829 }
1830 }
1831
1832 BUG_ON(req->r_reply);
1833
1834 if (!head->safe) {
1835 req->r_got_unsafe = true;
1836 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
1837 }
1838
1839 dout("handle_reply tid %lld result %d\n", tid, result);
1840 rinfo = &req->r_reply_info;
1841 err = parse_reply_info(msg, rinfo);
1842 mutex_unlock(&mdsc->mutex);
1843
1844 mutex_lock(&session->s_mutex);
1845 if (err < 0) {
1846 pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds);
1847 ceph_msg_dump(msg);
1848 goto out_err;
1849 }
1850
1851 /* snap trace */
1852 if (rinfo->snapblob_len) {
1853 down_write(&mdsc->snap_rwsem);
1854 ceph_update_snap_trace(mdsc, rinfo->snapblob,
1855 rinfo->snapblob + rinfo->snapblob_len,
1856 le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP);
1857 downgrade_write(&mdsc->snap_rwsem);
1858 } else {
1859 down_read(&mdsc->snap_rwsem);
1860 }
1861
1862 /* insert trace into our cache */
1863 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
1864 if (err == 0) {
1865 if (result == 0 && rinfo->dir_nr)
1866 ceph_readdir_prepopulate(req, req->r_session);
1867 ceph_unreserve_caps(&req->r_caps_reservation);
1868 }
1869
1870 up_read(&mdsc->snap_rwsem);
1871out_err:
1872 if (err) {
1873 req->r_err = err;
1874 } else {
1875 req->r_reply = msg;
1876 ceph_msg_get(msg);
1877 }
1878
1879 add_cap_releases(mdsc, req->r_session, -1);
1880 mutex_unlock(&session->s_mutex);
1881
1882 /* kick calling process */
1883 complete_request(mdsc, req);
1884out:
1885 ceph_mdsc_put_request(req);
1886 return;
1887}
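/*
 * Sketch of the two-phase reply flow handled above, for a write op:
 *
 *   client -> mds : request, tid T
 *   mds -> client : unsafe reply   r_got_unsafe set, request linked on
 *                                  session->s_unsafe, caller unblocks
 *   mds -> client : safe reply     r_got_safe set, request unregistered,
 *                                  r_safe_completion fires (this is what
 *                                  fsync/umount wait on)
 */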
1888
1889
1890
1891/*
1892 * handle mds notification that our request has been forwarded.
1893 */
1894static void handle_forward(struct ceph_mds_client *mdsc,
1895 struct ceph_mds_session *session,
1896 struct ceph_msg *msg)
1897{
1898 struct ceph_mds_request *req;
1899 u64 tid = le64_to_cpu(msg->hdr.tid);
1900 u32 next_mds;
1901 u32 fwd_seq;
1902 int err = -EINVAL;
1903 void *p = msg->front.iov_base;
1904 void *end = p + msg->front.iov_len;
1905
1906 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1907 next_mds = ceph_decode_32(&p);
1908 fwd_seq = ceph_decode_32(&p);
1909
1910 mutex_lock(&mdsc->mutex);
1911 req = __lookup_request(mdsc, tid);
1912 if (!req) {
1913 dout("forward %llu to mds%d - req dne\n", tid, next_mds);
1914 goto out; /* dup reply? */
1915 }
1916
1917 if (fwd_seq <= req->r_num_fwd) {
1918 dout("forward %llu to mds%d - old seq %d <= %d\n",
1919		     tid, next_mds, fwd_seq, req->r_num_fwd);
1920 } else {
1921 /* resend. forward race not possible; mds would drop */
1922 dout("forward %llu to mds%d (we resend)\n", tid, next_mds);
1923 req->r_num_fwd = fwd_seq;
1924 req->r_resend_mds = next_mds;
1925 put_request_session(req);
1926 __do_request(mdsc, req);
1927 }
1928 ceph_mdsc_put_request(req);
1929out:
1930 mutex_unlock(&mdsc->mutex);
1931 return;
1932
1933bad:
1934 pr_err("mdsc_handle_forward decode error err=%d\n", err);
1935}
1936
1937/*
1938 * handle a mds session control message
1939 */
1940static void handle_session(struct ceph_mds_session *session,
1941 struct ceph_msg *msg)
1942{
1943 struct ceph_mds_client *mdsc = session->s_mdsc;
1944 u32 op;
1945 u64 seq;
1946 int mds = session->s_mds;
1947 struct ceph_mds_session_head *h = msg->front.iov_base;
1948 int wake = 0;
1949
1950 /* decode */
1951 if (msg->front.iov_len != sizeof(*h))
1952 goto bad;
1953 op = le32_to_cpu(h->op);
1954 seq = le64_to_cpu(h->seq);
1955
1956 mutex_lock(&mdsc->mutex);
1957 if (op == CEPH_SESSION_CLOSE)
1958 __unregister_session(mdsc, session);
1959 /* FIXME: this ttl calculation is generous */
1960 session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
1961 mutex_unlock(&mdsc->mutex);
1962
1963 mutex_lock(&session->s_mutex);
1964
1965 dout("handle_session mds%d %s %p state %s seq %llu\n",
1966 mds, ceph_session_op_name(op), session,
1967 session_state_name(session->s_state), seq);
1968
1969 if (session->s_state == CEPH_MDS_SESSION_HUNG) {
1970 session->s_state = CEPH_MDS_SESSION_OPEN;
1971 pr_info("mds%d came back\n", session->s_mds);
1972 }
1973
1974 switch (op) {
1975 case CEPH_SESSION_OPEN:
1976 session->s_state = CEPH_MDS_SESSION_OPEN;
1977 renewed_caps(mdsc, session, 0);
1978 wake = 1;
1979 if (mdsc->stopping)
1980 __close_session(mdsc, session);
1981 break;
1982
1983 case CEPH_SESSION_RENEWCAPS:
1984 if (session->s_renew_seq == seq)
1985 renewed_caps(mdsc, session, 1);
1986 break;
1987
1988 case CEPH_SESSION_CLOSE:
1989 remove_session_caps(session);
1990 wake = 1; /* for good measure */
1991 complete(&mdsc->session_close_waiters);
1992 kick_requests(mdsc, mds, 0); /* cur only */
1993 break;
1994
1995 case CEPH_SESSION_STALE:
1996 pr_info("mds%d caps went stale, renewing\n",
1997 session->s_mds);
1998 spin_lock(&session->s_cap_lock);
1999 session->s_cap_gen++;
2000 session->s_cap_ttl = 0;
2001 spin_unlock(&session->s_cap_lock);
2002 send_renew_caps(mdsc, session);
2003 break;
2004
2005 case CEPH_SESSION_RECALL_STATE:
2006 trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
2007 break;
2008
2009 default:
2010 pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
2011 WARN_ON(1);
2012 }
2013
2014 mutex_unlock(&session->s_mutex);
2015 if (wake) {
2016 mutex_lock(&mdsc->mutex);
2017 __wake_requests(mdsc, &session->s_waiting);
2018 mutex_unlock(&mdsc->mutex);
2019 }
2020 return;
2021
2022bad:
2023 pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
2024 (int)msg->front.iov_len);
2025 ceph_msg_dump(msg);
2026 return;
2027}
2028
2029
2030/*
2031 * called under session->mutex.
2032 */
2033static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2034 struct ceph_mds_session *session)
2035{
2036 struct ceph_mds_request *req, *nreq;
2037 int err;
2038
2039 dout("replay_unsafe_requests mds%d\n", session->s_mds);
2040
2041 mutex_lock(&mdsc->mutex);
2042 list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
2043 err = __prepare_send_request(mdsc, req, session->s_mds);
2044 if (!err) {
2045 ceph_msg_get(req->r_request);
2046 ceph_con_send(&session->s_con, req->r_request);
2047 }
2048 }
2049 mutex_unlock(&mdsc->mutex);
2050}
2051
2052/*
2053 * Encode information about a cap for a reconnect with the MDS.
2054 */
2055static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2056 void *arg)
2057{
2058 struct ceph_mds_cap_reconnect rec;
2059 struct ceph_inode_info *ci;
2060 struct ceph_pagelist *pagelist = arg;
2061 char *path;
2062 int pathlen, err;
2063 u64 pathbase;
2064 struct dentry *dentry;
2065
2066 ci = cap->ci;
2067
2068 dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
2069 inode, ceph_vinop(inode), cap, cap->cap_id,
2070 ceph_cap_string(cap->issued));
2071 err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
2072 if (err)
2073 return err;
2074
2075 dentry = d_find_alias(inode);
2076 if (dentry) {
2077 path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
2078 if (IS_ERR(path)) {
2079 err = PTR_ERR(path);
2080 BUG_ON(err);
2081 }
2082 } else {
2083 path = NULL;
2084 pathlen = 0;
2085 }
2086 err = ceph_pagelist_encode_string(pagelist, path, pathlen);
2087 if (err)
2088 goto out;
2089
2090 spin_lock(&inode->i_lock);
2091 cap->seq = 0; /* reset cap seq */
2092 cap->issue_seq = 0; /* and issue_seq */
2093 rec.cap_id = cpu_to_le64(cap->cap_id);
2094 rec.pathbase = cpu_to_le64(pathbase);
2095 rec.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2096 rec.issued = cpu_to_le32(cap->issued);
2097 rec.size = cpu_to_le64(inode->i_size);
2098 ceph_encode_timespec(&rec.mtime, &inode->i_mtime);
2099 ceph_encode_timespec(&rec.atime, &inode->i_atime);
2100 rec.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2101 spin_unlock(&inode->i_lock);
2102
2103 err = ceph_pagelist_append(pagelist, &rec, sizeof(rec));
2104
2105out:
2106 kfree(path);
2107 dput(dentry);
2108 return err;
2109}
2110
2111
2112/*
2113 * If an MDS fails and recovers, clients need to reconnect in order to
2114 * reestablish shared state. This includes all caps issued through
2115 * this session _and_ the snap_realm hierarchy. Because it's not
2116 * clear which snap realms the mds cares about, we send everything we
2117 * know about; that ensures we'll then get any new info the
2118 * recovering MDS might have.
2119 *
2120 * This is a relatively heavyweight operation, but it's rare.
2121 *
2122 * called with mdsc->mutex held.
2123 */
2124static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
2125{
2126 struct ceph_mds_session *session = NULL;
2127 struct ceph_msg *reply;
2128 struct rb_node *p;
2129 int err;
2130 struct ceph_pagelist *pagelist;
2131
2132 pr_info("reconnect to recovering mds%d\n", mds);
2133
2134 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
2135 if (!pagelist)
2136 goto fail_nopagelist;
2137 ceph_pagelist_init(pagelist);
2138
2139 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL);
2140 if (IS_ERR(reply)) {
2141 err = PTR_ERR(reply);
2142 goto fail_nomsg;
2143 }
2144
2145 /* find session */
2146 session = __ceph_lookup_mds_session(mdsc, mds);
2147 mutex_unlock(&mdsc->mutex); /* drop lock for duration */
2148
2149 if (session) {
2150 mutex_lock(&session->s_mutex);
2151
2152 session->s_state = CEPH_MDS_SESSION_RECONNECTING;
2153 session->s_seq = 0;
2154
2155 ceph_con_open(&session->s_con,
2156 ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
2157
2158 /* replay unsafe requests */
2159 replay_unsafe_requests(mdsc, session);
2160 } else {
2161 dout("no session for mds%d, will send short reconnect\n",
2162 mds);
2163 }
2164
2165 down_read(&mdsc->snap_rwsem);
2166
2167 if (!session)
2168 goto send;
2169 dout("session %p state %s\n", session,
2170 session_state_name(session->s_state));
2171
2172 /* traverse this session's caps */
2173 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
2174 if (err)
2175 goto fail;
2176 err = iterate_session_caps(session, encode_caps_cb, pagelist);
2177 if (err < 0)
2178 goto out;
2179
2180 /*
2181 * snaprealms. we provide mds with the ino, seq (version), and
2182 * parent for all of our realms. If the mds has any newer info,
2183 * it will tell us.
2184 */
2185 for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
2186 struct ceph_snap_realm *realm =
2187 rb_entry(p, struct ceph_snap_realm, node);
2188 struct ceph_mds_snaprealm_reconnect sr_rec;
2189
2190 dout(" adding snap realm %llx seq %lld parent %llx\n",
2191 realm->ino, realm->seq, realm->parent_ino);
2192 sr_rec.ino = cpu_to_le64(realm->ino);
2193 sr_rec.seq = cpu_to_le64(realm->seq);
2194 sr_rec.parent = cpu_to_le64(realm->parent_ino);
2195 err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
2196 if (err)
2197 goto fail;
2198 }
2199
2200send:
2201 reply->pagelist = pagelist;
2202 reply->hdr.data_len = cpu_to_le32(pagelist->length);
2203 reply->nr_pages = calc_pages_for(0, pagelist->length);
2204 ceph_con_send(&session->s_con, reply);
2205
2206 if (session) {
2207 session->s_state = CEPH_MDS_SESSION_OPEN;
2208 __wake_requests(mdsc, &session->s_waiting);
2209 }
2210
2211out:
2212 up_read(&mdsc->snap_rwsem);
2213 if (session) {
2214 mutex_unlock(&session->s_mutex);
2215 ceph_put_mds_session(session);
2216 }
2217 mutex_lock(&mdsc->mutex);
2218 return;
2219
2220fail:
2221 ceph_msg_put(reply);
2222fail_nomsg:
2223 ceph_pagelist_release(pagelist);
2224 kfree(pagelist);
2225fail_nopagelist:
2226 pr_err("ENOMEM preparing reconnect for mds%d\n", mds);
2227 goto out;
2228}
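/*
 * Shape of the reconnect payload assembled above (a single pagelist):
 *
 *   __le32 nr_caps
 *   for each cap:    __le64 ino, string path, struct ceph_mds_cap_reconnect
 *   for each realm:  struct ceph_mds_snaprealm_reconnect (ino, seq, parent)
 */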
2229
2230
2231/*
2232 * compare old and new mdsmaps, kicking requests
2233 * and closing out old connections as necessary
2234 *
2235 * called under mdsc->mutex.
2236 */
2237static void check_new_map(struct ceph_mds_client *mdsc,
2238 struct ceph_mdsmap *newmap,
2239 struct ceph_mdsmap *oldmap)
2240{
2241 int i;
2242 int oldstate, newstate;
2243 struct ceph_mds_session *s;
2244
2245 dout("check_new_map new %u old %u\n",
2246 newmap->m_epoch, oldmap->m_epoch);
2247
2248 for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) {
2249 if (mdsc->sessions[i] == NULL)
2250 continue;
2251 s = mdsc->sessions[i];
2252 oldstate = ceph_mdsmap_get_state(oldmap, i);
2253 newstate = ceph_mdsmap_get_state(newmap, i);
2254
2255 dout("check_new_map mds%d state %s -> %s (session %s)\n",
2256 i, ceph_mds_state_name(oldstate),
2257 ceph_mds_state_name(newstate),
2258 session_state_name(s->s_state));
2259
2260 if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
2261 ceph_mdsmap_get_addr(newmap, i),
2262 sizeof(struct ceph_entity_addr))) {
2263 if (s->s_state == CEPH_MDS_SESSION_OPENING) {
2264 /* the session never opened, just close it
2265 * out now */
2266 __wake_requests(mdsc, &s->s_waiting);
2267 __unregister_session(mdsc, s);
2268 } else {
2269 /* just close it */
2270 mutex_unlock(&mdsc->mutex);
2271 mutex_lock(&s->s_mutex);
2272 mutex_lock(&mdsc->mutex);
2273 ceph_con_close(&s->s_con);
2274 mutex_unlock(&s->s_mutex);
2275 s->s_state = CEPH_MDS_SESSION_RESTARTING;
2276 }
2277
2278 /* kick any requests waiting on the recovering mds */
2279 kick_requests(mdsc, i, 1);
2280 } else if (oldstate == newstate) {
2281 continue; /* nothing new with this mds */
2282 }
2283
2284 /*
2285 * send reconnect?
2286 */
2287 if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
2288 newstate >= CEPH_MDS_STATE_RECONNECT)
2289 send_mds_reconnect(mdsc, i);
2290
2291 /*
2292 * kick requests on any mds that has gone active.
2293 *
2294 * kick requests on cur or forwarder: we may have sent
2295 * the request to mds1, mds1 told us it forwarded it
2296 * to mds2, but then we learn mds1 failed and can't be
2297 * sure it successfully forwarded our request before
2298 * it died.
2299 */
2300 if (oldstate < CEPH_MDS_STATE_ACTIVE &&
2301 newstate >= CEPH_MDS_STATE_ACTIVE) {
2302 pr_info("mds%d reconnect completed\n", s->s_mds);
2303 kick_requests(mdsc, i, 1);
2304 ceph_kick_flushing_caps(mdsc, s);
2305 wake_up_session_caps(s, 1);
2306 }
2307 }
2308}
2309
2310
2311
2312/*
2313 * leases
2314 */
2315
2316/*
2317 * caller must hold session s_mutex, dentry->d_lock
2318 */
2319void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
2320{
2321 struct ceph_dentry_info *di = ceph_dentry(dentry);
2322
2323 ceph_put_mds_session(di->lease_session);
2324 di->lease_session = NULL;
2325}
2326
2327static void handle_lease(struct ceph_mds_client *mdsc,
2328 struct ceph_mds_session *session,
2329 struct ceph_msg *msg)
2330{
2331 struct super_block *sb = mdsc->client->sb;
2332 struct inode *inode;
2333 struct ceph_inode_info *ci;
2334 struct dentry *parent, *dentry;
2335 struct ceph_dentry_info *di;
2336 int mds = session->s_mds;
2337 struct ceph_mds_lease *h = msg->front.iov_base;
2338 struct ceph_vino vino;
2339 int mask;
2340 struct qstr dname;
2341 int release = 0;
2342
2343 dout("handle_lease from mds%d\n", mds);
2344
2345 /* decode */
2346 if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
2347 goto bad;
2348 vino.ino = le64_to_cpu(h->ino);
2349 vino.snap = CEPH_NOSNAP;
2350 mask = le16_to_cpu(h->mask);
2351 dname.name = (void *)h + sizeof(*h) + sizeof(u32);
2352 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
2353 if (dname.len != get_unaligned_le32(h+1))
2354 goto bad;
2355
2356 mutex_lock(&session->s_mutex);
2357 session->s_seq++;
2358
2359 /* lookup inode */
2360 inode = ceph_find_inode(sb, vino);
2361 dout("handle_lease '%s', mask %d, ino %llx %p\n",
2362 ceph_lease_op_name(h->action), mask, vino.ino, inode);
2363 if (inode == NULL) {
2364 dout("handle_lease no inode %llx\n", vino.ino);
2365 goto release;
2366 }
2367 ci = ceph_inode(inode);
2368
2369 /* dentry */
2370 parent = d_find_alias(inode);
2371 if (!parent) {
2372 dout("no parent dentry on inode %p\n", inode);
2373 WARN_ON(1);
2374 goto release; /* hrm... */
2375 }
2376 dname.hash = full_name_hash(dname.name, dname.len);
2377 dentry = d_lookup(parent, &dname);
2378 dput(parent);
2379 if (!dentry)
2380 goto release;
2381
2382 spin_lock(&dentry->d_lock);
2383 di = ceph_dentry(dentry);
2384 switch (h->action) {
2385 case CEPH_MDS_LEASE_REVOKE:
2386 if (di && di->lease_session == session) {
2387 h->seq = cpu_to_le32(di->lease_seq);
2388 __ceph_mdsc_drop_dentry_lease(dentry);
2389 }
2390 release = 1;
2391 break;
2392
2393 case CEPH_MDS_LEASE_RENEW:
2394 if (di && di->lease_session == session &&
2395 di->lease_gen == session->s_cap_gen &&
2396 di->lease_renew_from &&
2397 di->lease_renew_after == 0) {
2398 unsigned long duration =
2399 le32_to_cpu(h->duration_ms) * HZ / 1000;
2400
2401 di->lease_seq = le32_to_cpu(h->seq);
2402 dentry->d_time = di->lease_renew_from + duration;
2403 di->lease_renew_after = di->lease_renew_from +
2404 (duration >> 1);
2405 di->lease_renew_from = 0;
2406 }
2407 break;
2408 }
2409 spin_unlock(&dentry->d_lock);
2410 dput(dentry);
2411
2412 if (!release)
2413 goto out;
2414
2415release:
2416 /* let's just reuse the same message */
2417 h->action = CEPH_MDS_LEASE_REVOKE_ACK;
2418 ceph_msg_get(msg);
2419 ceph_con_send(&session->s_con, msg);
2420
2421out:
2422 iput(inode);
2423 mutex_unlock(&session->s_mutex);
2424 return;
2425
2426bad:
2427 pr_err("corrupt lease message\n");
2428 ceph_msg_dump(msg);
2429}
2430
2431void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2432 struct inode *inode,
2433 struct dentry *dentry, char action,
2434 u32 seq)
2435{
2436 struct ceph_msg *msg;
2437 struct ceph_mds_lease *lease;
2438 int len = sizeof(*lease) + sizeof(u32);
2439 int dnamelen = 0;
2440
2441 dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
2442 inode, dentry, ceph_lease_op_name(action), session->s_mds);
2443 dnamelen = dentry->d_name.len;
2444 len += dnamelen;
2445
2446 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL);
2447 if (IS_ERR(msg))
2448 return;
2449 lease = msg->front.iov_base;
2450 lease->action = action;
2451 lease->mask = cpu_to_le16(CEPH_LOCK_DN);
2452 lease->ino = cpu_to_le64(ceph_vino(inode).ino);
2453 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
2454 lease->seq = cpu_to_le32(seq);
2455 put_unaligned_le32(dnamelen, lease + 1);
2456 memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
2457
2458 /*
2459 * if this is a preemptive lease RELEASE, no need to
2460 * flush request stream, since the actual request will
2461 * soon follow.
2462 */
2463 msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
2464
2465 ceph_con_send(&session->s_con, msg);
2466}
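/*
 * Resulting message front, per the encoding above:
 *
 *   struct ceph_mds_lease | __le32 dname_len | dname bytes
 *
 * hence len = sizeof(*lease) + sizeof(u32) + dnamelen.
 */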
2467
2468/*
2469 * Preemptively release a lease we expect to invalidate anyway.
2470 * Pass @inode always, @dentry is optional.
2471 */
2472void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2473 struct dentry *dentry, int mask)
2474{
2475 struct ceph_dentry_info *di;
2476 struct ceph_mds_session *session;
2477 u32 seq;
2478
2479 BUG_ON(inode == NULL);
2480 BUG_ON(dentry == NULL);
2481 BUG_ON(mask != CEPH_LOCK_DN);
2482
2483 /* is dentry lease valid? */
2484 spin_lock(&dentry->d_lock);
2485 di = ceph_dentry(dentry);
2486 if (!di || !di->lease_session ||
2487 di->lease_session->s_mds < 0 ||
2488 di->lease_gen != di->lease_session->s_cap_gen ||
2489 !time_before(jiffies, dentry->d_time)) {
2490 dout("lease_release inode %p dentry %p -- "
2491 "no lease on %d\n",
2492 inode, dentry, mask);
2493 spin_unlock(&dentry->d_lock);
2494 return;
2495 }
2496
2497 /* we do have a lease on this dentry; note mds and seq */
2498 session = ceph_get_mds_session(di->lease_session);
2499 seq = di->lease_seq;
2500 __ceph_mdsc_drop_dentry_lease(dentry);
2501 spin_unlock(&dentry->d_lock);
2502
2503 dout("lease_release inode %p dentry %p mask %d to mds%d\n",
2504 inode, dentry, mask, session->s_mds);
2505 ceph_mdsc_lease_send_msg(session, inode, dentry,
2506 CEPH_MDS_LEASE_RELEASE, seq);
2507 ceph_put_mds_session(session);
2508}
2509
2510/*
2511 * drop all leases (and dentry refs) in preparation for umount
2512 */
2513static void drop_leases(struct ceph_mds_client *mdsc)
2514{
2515 int i;
2516
2517 dout("drop_leases\n");
2518 mutex_lock(&mdsc->mutex);
2519 for (i = 0; i < mdsc->max_sessions; i++) {
2520 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
2521 if (!s)
2522 continue;
2523 mutex_unlock(&mdsc->mutex);
2524 mutex_lock(&s->s_mutex);
2525 mutex_unlock(&s->s_mutex);
2526 ceph_put_mds_session(s);
2527 mutex_lock(&mdsc->mutex);
2528 }
2529 mutex_unlock(&mdsc->mutex);
2530}
2531
2532
2533
2534/*
2535 * delayed work -- periodically trim expired leases, renew caps with mds
2536 */
2537static void schedule_delayed(struct ceph_mds_client *mdsc)
2538{
2539 int delay = 5;
2540 unsigned hz = round_jiffies_relative(HZ * delay);
2541 schedule_delayed_work(&mdsc->delayed_work, hz);
2542}
2543
2544static void delayed_work(struct work_struct *work)
2545{
2546 int i;
2547 struct ceph_mds_client *mdsc =
2548 container_of(work, struct ceph_mds_client, delayed_work.work);
2549 int renew_interval;
2550 int renew_caps;
2551
2552 dout("mdsc delayed_work\n");
2553 ceph_check_delayed_caps(mdsc);
2554
2555 mutex_lock(&mdsc->mutex);
2556 renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
2557 renew_caps = time_after_eq(jiffies, HZ*renew_interval +
2558 mdsc->last_renew_caps);
2559 if (renew_caps)
2560 mdsc->last_renew_caps = jiffies;
2561
2562 for (i = 0; i < mdsc->max_sessions; i++) {
2563 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
2564 if (s == NULL)
2565 continue;
2566 if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
2567 dout("resending session close request for mds%d\n",
2568 s->s_mds);
2569 request_close_session(mdsc, s);
2570 ceph_put_mds_session(s);
2571 continue;
2572 }
2573 if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
2574 if (s->s_state == CEPH_MDS_SESSION_OPEN) {
2575 s->s_state = CEPH_MDS_SESSION_HUNG;
2576 pr_info("mds%d hung\n", s->s_mds);
2577 }
2578 }
2579 if (s->s_state < CEPH_MDS_SESSION_OPEN) {
2580 /* this mds is failed or recovering, just wait */
2581 ceph_put_mds_session(s);
2582 continue;
2583 }
2584 mutex_unlock(&mdsc->mutex);
2585
2586 mutex_lock(&s->s_mutex);
2587 if (renew_caps)
2588 send_renew_caps(mdsc, s);
2589 else
2590 ceph_con_keepalive(&s->s_con);
2591 add_cap_releases(mdsc, s, -1);
2592 send_cap_releases(mdsc, s);
2593 mutex_unlock(&s->s_mutex);
2594 ceph_put_mds_session(s);
2595
2596 mutex_lock(&mdsc->mutex);
2597 }
2598 mutex_unlock(&mdsc->mutex);
2599
2600 schedule_delayed(mdsc);
2601}
2602
2603
2604int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2605{
2606 mdsc->client = client;
2607 mutex_init(&mdsc->mutex);
2608 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
2609 init_completion(&mdsc->safe_umount_waiters);
2610 init_completion(&mdsc->session_close_waiters);
2611 INIT_LIST_HEAD(&mdsc->waiting_for_map);
2612 mdsc->sessions = NULL;
2613 mdsc->max_sessions = 0;
2614 mdsc->stopping = 0;
2615 init_rwsem(&mdsc->snap_rwsem);
2616 mdsc->snap_realms = RB_ROOT;
2617 INIT_LIST_HEAD(&mdsc->snap_empty);
2618 spin_lock_init(&mdsc->snap_empty_lock);
2619 mdsc->last_tid = 0;
2620 mdsc->request_tree = RB_ROOT;
2621 INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
2622 mdsc->last_renew_caps = jiffies;
2623 INIT_LIST_HEAD(&mdsc->cap_delay_list);
2624 spin_lock_init(&mdsc->cap_delay_lock);
2625 INIT_LIST_HEAD(&mdsc->snap_flush_list);
2626 spin_lock_init(&mdsc->snap_flush_lock);
2627 mdsc->cap_flush_seq = 0;
2628 INIT_LIST_HEAD(&mdsc->cap_dirty);
2629 mdsc->num_cap_flushing = 0;
2630 spin_lock_init(&mdsc->cap_dirty_lock);
2631 init_waitqueue_head(&mdsc->cap_flushing_wq);
2632 spin_lock_init(&mdsc->dentry_lru_lock);
2633 INIT_LIST_HEAD(&mdsc->dentry_lru);
2634 return 0;
2635}
2636
2637/*
2638 * Wait for safe replies on open mds requests. If we time out, drop
2639 * all requests from the tree to avoid dangling dentry refs.
2640 */
2641static void wait_requests(struct ceph_mds_client *mdsc)
2642{
2643 struct ceph_mds_request *req;
2644 struct ceph_client *client = mdsc->client;
2645
2646 mutex_lock(&mdsc->mutex);
2647 if (__get_oldest_req(mdsc)) {
2648 mutex_unlock(&mdsc->mutex);
2649
2650 dout("wait_requests waiting for requests\n");
2651 wait_for_completion_timeout(&mdsc->safe_umount_waiters,
2652 client->mount_args->mount_timeout * HZ);
2653
2654 /* tear down remaining requests */
2655 mutex_lock(&mdsc->mutex);
2656 while ((req = __get_oldest_req(mdsc))) {
2657 dout("wait_requests timed out on tid %llu\n",
2658 req->r_tid);
2659 __unregister_request(mdsc, req);
2660 }
2661 }
2662 mutex_unlock(&mdsc->mutex);
2663 dout("wait_requests done\n");
2664}
2665
2666/*
2667 * called before mount is ro, and before dentries are torn down.
2668 * (hmm, does this still race with new lookups?)
2669 */
2670void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
2671{
2672 dout("pre_umount\n");
2673 mdsc->stopping = 1;
2674
2675 drop_leases(mdsc);
2676 ceph_flush_dirty_caps(mdsc);
2677 wait_requests(mdsc);
2678}
2679
2680/*
2681 * wait for all write mds requests to flush.
2682 */
2683static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
2684{
2685 struct ceph_mds_request *req = NULL;
2686 struct rb_node *n;
2687
2688 mutex_lock(&mdsc->mutex);
2689 dout("wait_unsafe_requests want %lld\n", want_tid);
2690 req = __get_oldest_req(mdsc);
2691 while (req && req->r_tid <= want_tid) {
2692 if ((req->r_op & CEPH_MDS_OP_WRITE)) {
2693 /* write op */
2694 ceph_mdsc_get_request(req);
2695 mutex_unlock(&mdsc->mutex);
2696 dout("wait_unsafe_requests wait on %llu (want %llu)\n",
2697 req->r_tid, want_tid);
2698 wait_for_completion(&req->r_safe_completion);
2699 mutex_lock(&mdsc->mutex);
2700 n = rb_next(&req->r_node);
2701 ceph_mdsc_put_request(req);
2702 } else {
2703 n = rb_next(&req->r_node);
2704 }
2705 if (!n)
2706 break;
2707 req = rb_entry(n, struct ceph_mds_request, r_node);
2708 }
2709 mutex_unlock(&mdsc->mutex);
2710 dout("wait_unsafe_requests done\n");
2711}
2712
2713void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
2714{
2715 u64 want_tid, want_flush;
2716
2717 dout("sync\n");
2718 mutex_lock(&mdsc->mutex);
2719 want_tid = mdsc->last_tid;
2720 want_flush = mdsc->cap_flush_seq;
2721 mutex_unlock(&mdsc->mutex);
2722 dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
2723
2724 ceph_flush_dirty_caps(mdsc);
2725
2726 wait_unsafe_requests(mdsc, want_tid);
2727 wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
2728}
2729
2730
2731/*
2732 * called after sb is ro.
2733 */
2734void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
2735{
2736 struct ceph_mds_session *session;
2737 int i;
2738 int n;
2739 struct ceph_client *client = mdsc->client;
2740 unsigned long started, timeout = client->mount_args->mount_timeout * HZ;
2741
2742 dout("close_sessions\n");
2743
2744 mutex_lock(&mdsc->mutex);
2745
2746 /* close sessions */
2747 started = jiffies;
2748 while (time_before(jiffies, started + timeout)) {
2749 dout("closing sessions\n");
2750 n = 0;
2751 for (i = 0; i < mdsc->max_sessions; i++) {
2752 session = __ceph_lookup_mds_session(mdsc, i);
2753 if (!session)
2754 continue;
2755 mutex_unlock(&mdsc->mutex);
2756 mutex_lock(&session->s_mutex);
2757 __close_session(mdsc, session);
2758 mutex_unlock(&session->s_mutex);
2759 ceph_put_mds_session(session);
2760 mutex_lock(&mdsc->mutex);
2761 n++;
2762 }
2763 if (n == 0)
2764 break;
2765
2766 if (client->mount_state == CEPH_MOUNT_SHUTDOWN)
2767 break;
2768
2769 dout("waiting for sessions to close\n");
2770 mutex_unlock(&mdsc->mutex);
2771 wait_for_completion_timeout(&mdsc->session_close_waiters,
2772 timeout);
2773 mutex_lock(&mdsc->mutex);
2774 }
2775
2776 /* tear down remaining sessions */
2777 for (i = 0; i < mdsc->max_sessions; i++) {
2778 if (mdsc->sessions[i]) {
2779 session = get_session(mdsc->sessions[i]);
2780 __unregister_session(mdsc, session);
2781 mutex_unlock(&mdsc->mutex);
2782 mutex_lock(&session->s_mutex);
2783 remove_session_caps(session);
2784 mutex_unlock(&session->s_mutex);
2785 ceph_put_mds_session(session);
2786 mutex_lock(&mdsc->mutex);
2787 }
2788 }
2789
2790 WARN_ON(!list_empty(&mdsc->cap_delay_list));
2791
2792 mutex_unlock(&mdsc->mutex);
2793
2794 ceph_cleanup_empty_realms(mdsc);
2795
2796 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
2797
2798 dout("stopped\n");
2799}
2800
2801void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
2802{
2803 dout("stop\n");
2804 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
2805 if (mdsc->mdsmap)
2806 ceph_mdsmap_destroy(mdsc->mdsmap);
2807 kfree(mdsc->sessions);
2808}
2809
2810
2811/*
2812 * handle mds map update.
2813 */
2814void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
2815{
2816 u32 epoch;
2817 u32 maplen;
2818 void *p = msg->front.iov_base;
2819 void *end = p + msg->front.iov_len;
2820 struct ceph_mdsmap *newmap, *oldmap;
2821 struct ceph_fsid fsid;
2822 int err = -EINVAL;
2823
2824 ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
2825 ceph_decode_copy(&p, &fsid, sizeof(fsid));
2826 if (ceph_check_fsid(mdsc->client, &fsid) < 0)
2827 return;
2828 epoch = ceph_decode_32(&p);
2829 maplen = ceph_decode_32(&p);
2830 dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
2831
2832 /* do we need it? */
2833 ceph_monc_got_mdsmap(&mdsc->client->monc, epoch);
2834 mutex_lock(&mdsc->mutex);
2835 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
2836 dout("handle_map epoch %u <= our %u\n",
2837 epoch, mdsc->mdsmap->m_epoch);
2838 mutex_unlock(&mdsc->mutex);
2839 return;
2840 }
2841
2842 newmap = ceph_mdsmap_decode(&p, end);
2843 if (IS_ERR(newmap)) {
2844 err = PTR_ERR(newmap);
2845 goto bad_unlock;
2846 }
2847
2848 /* swap into place */
2849 if (mdsc->mdsmap) {
2850 oldmap = mdsc->mdsmap;
2851 mdsc->mdsmap = newmap;
2852 check_new_map(mdsc, newmap, oldmap);
2853 ceph_mdsmap_destroy(oldmap);
2854 } else {
2855 mdsc->mdsmap = newmap; /* first mds map */
2856 }
2857 mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
2858
2859 __wake_requests(mdsc, &mdsc->waiting_for_map);
2860
2861 mutex_unlock(&mdsc->mutex);
2862 schedule_delayed(mdsc);
2863 return;
2864
2865bad_unlock:
2866 mutex_unlock(&mdsc->mutex);
2867bad:
2868 pr_err("error decoding mdsmap %d\n", err);
2869 return;
2870}
2871
2872static struct ceph_connection *con_get(struct ceph_connection *con)
2873{
2874 struct ceph_mds_session *s = con->private;
2875
2876 if (get_session(s)) {
2877 dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref));
2878 return con;
2879 }
2880 dout("mdsc con_get %p FAIL\n", s);
2881 return NULL;
2882}
2883
2884static void con_put(struct ceph_connection *con)
2885{
2886 struct ceph_mds_session *s = con->private;
2887
2888 ceph_put_mds_session(s);
2889 dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref));
2890}
2891
2892/*
2893 * if the client is unresponsive for long enough, the mds will kill
2894 * the session entirely.
2895 */
2896static void peer_reset(struct ceph_connection *con)
2897{
2898 struct ceph_mds_session *s = con->private;
2899
2900 pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n",
2901 s->s_mds);
2902}
2903
2904static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
2905{
2906 struct ceph_mds_session *s = con->private;
2907 struct ceph_mds_client *mdsc = s->s_mdsc;
2908 int type = le16_to_cpu(msg->hdr.type);
2909
2910 mutex_lock(&mdsc->mutex);
2911 if (__verify_registered_session(mdsc, s) < 0) {
2912 mutex_unlock(&mdsc->mutex);
2913 goto out;
2914 }
2915 mutex_unlock(&mdsc->mutex);
2916
2917 switch (type) {
2918 case CEPH_MSG_MDS_MAP:
2919 ceph_mdsc_handle_map(mdsc, msg);
2920 break;
2921 case CEPH_MSG_CLIENT_SESSION:
2922 handle_session(s, msg);
2923 break;
2924 case CEPH_MSG_CLIENT_REPLY:
2925 handle_reply(s, msg);
2926 break;
2927 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
2928 handle_forward(mdsc, s, msg);
2929 break;
2930 case CEPH_MSG_CLIENT_CAPS:
2931 ceph_handle_caps(s, msg);
2932 break;
2933 case CEPH_MSG_CLIENT_SNAP:
2934 ceph_handle_snap(mdsc, s, msg);
2935 break;
2936 case CEPH_MSG_CLIENT_LEASE:
2937 handle_lease(mdsc, s, msg);
2938 break;
2939
2940 default:
2941 pr_err("received unknown message type %d %s\n", type,
2942 ceph_msg_type_name(type));
2943 }
2944out:
2945 ceph_msg_put(msg);
2946}
2947
2948/*
2949 * authentication
2950 */
2951static int get_authorizer(struct ceph_connection *con,
2952 void **buf, int *len, int *proto,
2953 void **reply_buf, int *reply_len, int force_new)
2954{
2955 struct ceph_mds_session *s = con->private;
2956 struct ceph_mds_client *mdsc = s->s_mdsc;
2957 struct ceph_auth_client *ac = mdsc->client->monc.auth;
2958 int ret = 0;
2959
2960 if (force_new && s->s_authorizer) {
2961 ac->ops->destroy_authorizer(ac, s->s_authorizer);
2962 s->s_authorizer = NULL;
2963 }
2964 if (s->s_authorizer == NULL) {
2965 if (ac->ops->create_authorizer) {
2966 ret = ac->ops->create_authorizer(
2967 ac, CEPH_ENTITY_TYPE_MDS,
2968 &s->s_authorizer,
2969 &s->s_authorizer_buf,
2970 &s->s_authorizer_buf_len,
2971 &s->s_authorizer_reply_buf,
2972 &s->s_authorizer_reply_buf_len);
2973 if (ret)
2974 return ret;
2975 }
2976 }
2977
2978 *proto = ac->protocol;
2979 *buf = s->s_authorizer_buf;
2980 *len = s->s_authorizer_buf_len;
2981 *reply_buf = s->s_authorizer_reply_buf;
2982 *reply_len = s->s_authorizer_reply_buf_len;
2983 return 0;
2984}
2985
2986
2987static int verify_authorizer_reply(struct ceph_connection *con, int len)
2988{
2989 struct ceph_mds_session *s = con->private;
2990 struct ceph_mds_client *mdsc = s->s_mdsc;
2991 struct ceph_auth_client *ac = mdsc->client->monc.auth;
2992
2993 return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
2994}
2995
2996static int invalidate_authorizer(struct ceph_connection *con)
2997{
2998 struct ceph_mds_session *s = con->private;
2999 struct ceph_mds_client *mdsc = s->s_mdsc;
3000 struct ceph_auth_client *ac = mdsc->client->monc.auth;
3001
3002 if (ac->ops->invalidate_authorizer)
3003 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
3004
3005 return ceph_monc_validate_auth(&mdsc->client->monc);
3006}
3007
3008static const struct ceph_connection_operations mds_con_ops = {
3009 .get = con_get,
3010 .put = con_put,
3011 .dispatch = dispatch,
3012 .get_authorizer = get_authorizer,
3013 .verify_authorizer_reply = verify_authorizer_reply,
3014 .invalidate_authorizer = invalidate_authorizer,
3015 .peer_reset = peer_reset,
3016};
3017
3018
3019
3020
3021/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
new file mode 100644
index 000000000000..961cc6f65878
--- /dev/null
+++ b/fs/ceph/mds_client.h
@@ -0,0 +1,335 @@
1#ifndef _FS_CEPH_MDS_CLIENT_H
2#define _FS_CEPH_MDS_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/list.h>
7#include <linux/mutex.h>
8#include <linux/rbtree.h>
9#include <linux/spinlock.h>
10
11#include "types.h"
12#include "messenger.h"
13#include "mdsmap.h"
14
15/*
16 * Some lock dependencies:
17 *
18 * session->s_mutex
19 * mdsc->mutex
20 *
21 * mdsc->snap_rwsem
22 *
23 * inode->i_lock
24 * mdsc->snap_flush_lock
25 * mdsc->cap_delay_lock
26 *
27 */
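/*
 * Example consistent with the ordering above: s_mutex is taken before
 * mdsc->mutex.  Code that already holds mdsc->mutex must drop it first
 * (as check_new_map() does):
 *
 *	mutex_unlock(&mdsc->mutex);
 *	mutex_lock(&s->s_mutex);
 *	mutex_lock(&mdsc->mutex);
 */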
28
29struct ceph_client;
30struct ceph_cap;
31
32/*
33 * parsed info about a single inode. pointers are into the encoded
34 * on-wire structures within the mds reply message payload.
35 */
36struct ceph_mds_reply_info_in {
37 struct ceph_mds_reply_inode *in;
38 u32 symlink_len;
39 char *symlink;
40 u32 xattr_len;
41 char *xattr_data;
42};
43
44/*
45 * parsed info about an mds reply, including information about the
46 * target inode and/or its parent directory and dentry, and directory
47 * contents (for readdir results).
48 */
49struct ceph_mds_reply_info_parsed {
50 struct ceph_mds_reply_head *head;
51
52 struct ceph_mds_reply_info_in diri, targeti;
53 struct ceph_mds_reply_dirfrag *dirfrag;
54 char *dname;
55 u32 dname_len;
56 struct ceph_mds_reply_lease *dlease;
57
58 struct ceph_mds_reply_dirfrag *dir_dir;
59 int dir_nr;
60 char **dir_dname;
61 u32 *dir_dname_len;
62 struct ceph_mds_reply_lease **dir_dlease;
63 struct ceph_mds_reply_info_in *dir_in;
64 u8 dir_complete, dir_end;
65
66 /* encoded blob describing snapshot contexts for certain
67 operations (e.g., open) */
68 void *snapblob;
69 int snapblob_len;
70};
71
72
73/*
74 * cap releases are batched and sent to the MDS en masse.
75 */
76#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \
77 sizeof(struct ceph_mds_cap_release)) / \
78 sizeof(struct ceph_mds_cap_item))
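/*
 * For a rough sense of scale: assuming (for illustration only) 4 KB
 * pages, a 4-byte ceph_mds_cap_release header, and 24-byte
 * ceph_mds_cap_item entries, this comes to (4096 - 4) / 24 = 170
 * releases per message.
 */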
79
80
81/*
82 * state associated with each MDS<->client session
83 */
84enum {
85 CEPH_MDS_SESSION_NEW = 1,
86 CEPH_MDS_SESSION_OPENING = 2,
87 CEPH_MDS_SESSION_OPEN = 3,
88 CEPH_MDS_SESSION_HUNG = 4,
89 CEPH_MDS_SESSION_CLOSING = 5,
90 CEPH_MDS_SESSION_RESTARTING = 6,
91 CEPH_MDS_SESSION_RECONNECTING = 7,
92};
93
94struct ceph_mds_session {
95 struct ceph_mds_client *s_mdsc;
96 int s_mds;
97 int s_state;
98 unsigned long s_ttl; /* time until mds kills us */
99 u64 s_seq; /* incoming msg seq # */
100 struct mutex s_mutex; /* serialize session messages */
101
102 struct ceph_connection s_con;
103
104 struct ceph_authorizer *s_authorizer;
105 void *s_authorizer_buf, *s_authorizer_reply_buf;
106 size_t s_authorizer_buf_len, s_authorizer_reply_buf_len;
107
108 /* protected by s_cap_lock */
109 spinlock_t s_cap_lock;
110 u32 s_cap_gen; /* inc each time we get mds stale msg */
111 unsigned long s_cap_ttl; /* when session caps expire */
112 struct list_head s_caps; /* all caps issued by this session */
113 int s_nr_caps, s_trim_caps;
114 int s_num_cap_releases;
115 struct list_head s_cap_releases; /* waiting cap_release messages */
116 struct list_head s_cap_releases_done; /* ready to send */
117 struct ceph_cap *s_cap_iterator;
118
119 /* protected by mutex */
120 struct list_head s_cap_flushing; /* inodes w/ flushing caps */
121 struct list_head s_cap_snaps_flushing;
122 unsigned long s_renew_requested; /* last time we sent a renew req */
123 u64 s_renew_seq;
124
125 atomic_t s_ref;
126 struct list_head s_waiting; /* waiting requests */
127 struct list_head s_unsafe; /* unsafe requests */
128};
129
130/*
131 * modes of choosing which MDS to send a request to
132 */
133enum {
134 USE_ANY_MDS,
135 USE_RANDOM_MDS,
136 USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */
137};
138
139struct ceph_mds_request;
140struct ceph_mds_client;
141
142/*
143 * request completion callback
144 */
145typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
146 struct ceph_mds_request *req);
147
148/*
149 * an in-flight mds request
150 */
151struct ceph_mds_request {
152 u64 r_tid; /* transaction id */
153 struct rb_node r_node;
154
155 int r_op; /* mds op code */
156 int r_mds;
157
158 /* operation on what? */
159 struct inode *r_inode; /* arg1 */
160 struct dentry *r_dentry; /* arg1 */
161 struct dentry *r_old_dentry; /* arg2: rename from or link from */
162 char *r_path1, *r_path2;
163 struct ceph_vino r_ino1, r_ino2;
164
165 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
166 struct inode *r_target_inode; /* resulting inode */
167
168 union ceph_mds_request_args r_args;
169 int r_fmode; /* file mode, if expecting cap */
170
171 /* for choosing which mds to send this request to */
172 int r_direct_mode;
173 u32 r_direct_hash; /* choose dir frag based on this dentry hash */
174 bool r_direct_is_hash; /* true if r_direct_hash is valid */
175
176 /* data payload is used for xattr ops */
177 struct page **r_pages;
178 int r_num_pages;
179 int r_data_len;
180
181 /* what caps shall we drop? */
182 int r_inode_drop, r_inode_unless;
183 int r_dentry_drop, r_dentry_unless;
184 int r_old_dentry_drop, r_old_dentry_unless;
185 struct inode *r_old_inode;
186 int r_old_inode_drop, r_old_inode_unless;
187
188 struct ceph_msg *r_request; /* original request */
189 struct ceph_msg *r_reply;
190 struct ceph_mds_reply_info_parsed r_reply_info;
191 int r_err;
192 bool r_aborted;
193
194 unsigned long r_timeout; /* optional. jiffies */
195 unsigned long r_started; /* start time to measure timeout against */
196 unsigned long r_request_started; /* start time for mds request only,
197 used to measure lease durations */
198
199 /* link unsafe requests to parent directory, for fsync */
200 struct inode *r_unsafe_dir;
201 struct list_head r_unsafe_dir_item;
202
203 struct ceph_mds_session *r_session;
204
205 int r_attempts; /* resend attempts */
206 int r_num_fwd; /* number of forward attempts */
207 int r_num_stale;
208	int r_resend_mds; /* mds to resend to next, if any */
209
210 struct kref r_kref;
211 struct list_head r_wait;
212 struct completion r_completion;
213 struct completion r_safe_completion;
214 ceph_mds_request_callback_t r_callback;
215 struct list_head r_unsafe_item; /* per-session unsafe list item */
216 bool r_got_unsafe, r_got_safe;
217
218 bool r_did_prepopulate;
219 u32 r_readdir_offset;
220
221 struct ceph_cap_reservation r_caps_reservation;
222 int r_num_caps;
223};
224
225/*
226 * mds client state
227 */
228struct ceph_mds_client {
229 struct ceph_client *client;
230 struct mutex mutex; /* all nested structures */
231
232 struct ceph_mdsmap *mdsmap;
233 struct completion safe_umount_waiters, session_close_waiters;
234 struct list_head waiting_for_map;
235
236	struct ceph_mds_session **sessions; /* NULL if no session for that mds */
237	int max_sessions; /* length of sessions array */
238 int stopping; /* true if shutting down */
239
240 /*
241 * snap_rwsem will cover cap linkage into snaprealms, and
242 * realm snap contexts. (later, we can do per-realm snap
243 * context locks.)  the empty list contains realms with no
244 * references (implying they contain no inodes with caps) that
245 * should be destroyed.
246 */
247 struct rw_semaphore snap_rwsem;
248 struct rb_root snap_realms;
249 struct list_head snap_empty;
250 spinlock_t snap_empty_lock; /* protect snap_empty */
251
252 u64 last_tid; /* most recent mds request */
253 struct rb_root request_tree; /* pending mds requests */
254 struct delayed_work delayed_work; /* delayed work */
255 unsigned long last_renew_caps; /* last time we renewed our caps */
256 struct list_head cap_delay_list; /* caps with delayed release */
257 spinlock_t cap_delay_lock; /* protects cap_delay_list */
258 struct list_head snap_flush_list; /* cap_snaps ready to flush */
259 spinlock_t snap_flush_lock;
260
261 u64 cap_flush_seq;
262 struct list_head cap_dirty; /* inodes with dirty caps */
263 int num_cap_flushing; /* # caps we are flushing */
264 spinlock_t cap_dirty_lock; /* protects above items */
265 wait_queue_head_t cap_flushing_wq;
266
267#ifdef CONFIG_DEBUG_FS
268 struct dentry *debugfs_file;
269#endif
270
271 spinlock_t dentry_lru_lock;
272 struct list_head dentry_lru;
273 int num_dentry;
274};
275
276extern const char *ceph_mds_op_name(int op);
277
278extern struct ceph_mds_session *
279__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
280
281static inline struct ceph_mds_session *
282ceph_get_mds_session(struct ceph_mds_session *s)
283{
284 atomic_inc(&s->s_ref);
285 return s;
286}
287
288extern void ceph_put_mds_session(struct ceph_mds_session *s);
289
290extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
291 struct ceph_msg *msg, int mds);
292
293extern int ceph_mdsc_init(struct ceph_mds_client *mdsc,
294 struct ceph_client *client);
295extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
296extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);
297
298extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
299
300extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
301 struct inode *inode,
302 struct dentry *dn, int mask);
303
304extern struct ceph_mds_request *
305ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
306extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
307 struct ceph_mds_request *req);
308extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
309 struct inode *dir,
310 struct ceph_mds_request *req);
311static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
312{
313 kref_get(&req->r_kref);
314}
315extern void ceph_mdsc_release_request(struct kref *kref);
316static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
317{
318 kref_put(&req->r_kref, ceph_mdsc_release_request);
319}
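/*
 * Every ceph_mdsc_get_request() (and the initial reference from
 * ceph_mdsc_create_request()) must be balanced by a matching
 * ceph_mdsc_put_request(); ceph_mdsc_release_request() runs once
 * r_kref drops to zero.
 */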
320
321extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
322
323extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
324 int stop_on_nosnap);
325
326extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
327extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
328 struct inode *inode,
329 struct dentry *dentry, char action,
330 u32 seq);
331
332extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
333 struct ceph_msg *msg);
334
335#endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
new file mode 100644
index 000000000000..c4c498e6dfef
--- /dev/null
+++ b/fs/ceph/mdsmap.c
@@ -0,0 +1,174 @@
1#include "ceph_debug.h"
2
3#include <linux/bug.h>
4#include <linux/err.h>
5#include <linux/random.h>
6#include <linux/slab.h>
7#include <linux/types.h>
8
9#include "mdsmap.h"
10#include "messenger.h"
11#include "decode.h"
12
13#include "super.h"
14
15
16/*
 17 * choose a random mds that is "up" (i.e. has a state > 0), or -1 if none is up.
18 */
19int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
20{
21 int n = 0;
22 int i;
 23	u8 r;	/* unsigned: plain char may be signed */
24
25 /* count */
26 for (i = 0; i < m->m_max_mds; i++)
27 if (m->m_info[i].state > 0)
28 n++;
29 if (n == 0)
30 return -1;
31
 32	/* pick the nth "up" mds, skipping down entries */
 33	get_random_bytes(&r, 1);
 34	n = r % n;
 35	for (i = 0; ; i++)
 36		if (m->m_info[i].state > 0 && n-- == 0)
 37			break;
39
40 return i;
41}
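/*
 * Example: with states [down, up, down, up] the count is n = 2; a
 * random byte of 5 gives n = 5 % 2 = 1, and the loop above returns
 * index 3, the second "up" mds.
 */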
42
43/*
44 * Decode an MDS map
45 *
46 * Ignore any fields we don't care about (there are quite a few of
47 * them).
48 */
49struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
50{
51 struct ceph_mdsmap *m;
52 const void *start = *p;
53 int i, j, n;
54 int err = -EINVAL;
55 u16 version;
56
57 m = kzalloc(sizeof(*m), GFP_NOFS);
58 if (m == NULL)
59 return ERR_PTR(-ENOMEM);
60
61 ceph_decode_16_safe(p, end, version, bad);
62
63 ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
64 m->m_epoch = ceph_decode_32(p);
65 m->m_client_epoch = ceph_decode_32(p);
66 m->m_last_failure = ceph_decode_32(p);
67 m->m_root = ceph_decode_32(p);
68 m->m_session_timeout = ceph_decode_32(p);
69 m->m_session_autoclose = ceph_decode_32(p);
70 m->m_max_file_size = ceph_decode_64(p);
71 m->m_max_mds = ceph_decode_32(p);
72
73 m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
74 if (m->m_info == NULL)
75 goto badmem;
76
77 /* pick out active nodes from mds_info (state > 0) */
78 n = ceph_decode_32(p);
79 for (i = 0; i < n; i++) {
80 u64 global_id;
81 u32 namelen;
82 s32 mds, inc, state;
83 u64 state_seq;
84 u8 infoversion;
85 struct ceph_entity_addr addr;
86 u32 num_export_targets;
87 void *pexport_targets = NULL;
88
89 ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
90 global_id = ceph_decode_64(p);
91 infoversion = ceph_decode_8(p);
92 *p += sizeof(u64);
93 namelen = ceph_decode_32(p); /* skip mds name */
94 *p += namelen;
95
96 ceph_decode_need(p, end,
97 4*sizeof(u32) + sizeof(u64) +
98 sizeof(addr) + sizeof(struct ceph_timespec),
99 bad);
100 mds = ceph_decode_32(p);
101 inc = ceph_decode_32(p);
102 state = ceph_decode_32(p);
103 state_seq = ceph_decode_64(p);
104 ceph_decode_copy(p, &addr, sizeof(addr));
105 ceph_decode_addr(&addr);
106 *p += sizeof(struct ceph_timespec);
107 *p += sizeof(u32);
108 ceph_decode_32_safe(p, end, namelen, bad);
109 *p += namelen;
110 if (infoversion >= 2) {
111 ceph_decode_32_safe(p, end, num_export_targets, bad);
112 pexport_targets = *p;
113 *p += num_export_targets * sizeof(u32);
114 } else {
115 num_export_targets = 0;
116 }
117
118 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
119 i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr),
120 ceph_mds_state_name(state));
121 if (mds >= 0 && mds < m->m_max_mds && state > 0) {
122 m->m_info[mds].global_id = global_id;
123 m->m_info[mds].state = state;
124 m->m_info[mds].addr = addr;
125 m->m_info[mds].num_export_targets = num_export_targets;
126 if (num_export_targets) {
127 m->m_info[mds].export_targets =
128 kcalloc(num_export_targets, sizeof(u32),
129 GFP_NOFS);
130 for (j = 0; j < num_export_targets; j++)
131 m->m_info[mds].export_targets[j] =
132 ceph_decode_32(&pexport_targets);
133 } else {
134 m->m_info[mds].export_targets = NULL;
135 }
136 }
137 }
138
139 /* pg_pools */
140 ceph_decode_32_safe(p, end, n, bad);
141 m->m_num_data_pg_pools = n;
142 m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS);
143 if (!m->m_data_pg_pools)
144 goto badmem;
145 ceph_decode_need(p, end, sizeof(u32)*(n+1), bad);
146 for (i = 0; i < n; i++)
147 m->m_data_pg_pools[i] = ceph_decode_32(p);
148 m->m_cas_pg_pool = ceph_decode_32(p);
149
150 /* ok, we don't care about the rest. */
151 dout("mdsmap_decode success epoch %u\n", m->m_epoch);
152 return m;
153
154badmem:
155 err = -ENOMEM;
156bad:
157 pr_err("corrupt mdsmap\n");
158 print_hex_dump(KERN_DEBUG, "mdsmap: ",
159 DUMP_PREFIX_OFFSET, 16, 1,
160 start, end - start, true);
161 ceph_mdsmap_destroy(m);
162 return ERR_PTR(-EINVAL);
163}
164
165void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
166{
167 int i;
168
169 for (i = 0; i < m->m_max_mds; i++)
170 kfree(m->m_info[i].export_targets);
171 kfree(m->m_info);
172 kfree(m->m_data_pg_pools);
173 kfree(m);
174}
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
new file mode 100644
index 000000000000..eacc131aa5cb
--- /dev/null
+++ b/fs/ceph/mdsmap.h
@@ -0,0 +1,54 @@
1#ifndef _FS_CEPH_MDSMAP_H
2#define _FS_CEPH_MDSMAP_H
3
4#include "types.h"
5
6/*
7 * mds map - describe servers in the mds cluster.
8 *
 9 * we limit fields to those the client actually cares about
10 */
11struct ceph_mds_info {
12 u64 global_id;
13 struct ceph_entity_addr addr;
14 s32 state;
15 int num_export_targets;
16 u32 *export_targets;
17};
18
19struct ceph_mdsmap {
20 u32 m_epoch, m_client_epoch, m_last_failure;
21 u32 m_root;
22 u32 m_session_timeout; /* seconds */
23 u32 m_session_autoclose; /* seconds */
24 u64 m_max_file_size;
 25	u32 m_max_mds; /* size of m_info array */
26 struct ceph_mds_info *m_info;
27
28 /* which object pools file data can be stored in */
29 int m_num_data_pg_pools;
30 u32 *m_data_pg_pools;
31 u32 m_cas_pg_pool;
32};
33
34static inline struct ceph_entity_addr *
35ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
36{
37 if (w >= m->m_max_mds)
38 return NULL;
39 return &m->m_info[w].addr;
40}
41
42static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
43{
44 BUG_ON(w < 0);
45 if (w >= m->m_max_mds)
46 return CEPH_MDS_STATE_DNE;
47 return m->m_info[w].state;
48}
49
50extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
51extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
52extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
53
54#endif
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
new file mode 100644
index 000000000000..781656a49bf8
--- /dev/null
+++ b/fs/ceph/messenger.c
@@ -0,0 +1,2240 @@
1#include "ceph_debug.h"
2
3#include <linux/crc32c.h>
4#include <linux/ctype.h>
5#include <linux/highmem.h>
6#include <linux/inet.h>
7#include <linux/kthread.h>
8#include <linux/net.h>
9#include <linux/socket.h>
10#include <linux/string.h>
11#include <net/tcp.h>
12
13#include "super.h"
14#include "messenger.h"
15#include "decode.h"
16#include "pagelist.h"
17
18/*
19 * Ceph uses the messenger to exchange ceph_msg messages with other
20 * hosts in the system. The messenger provides ordered and reliable
21 * delivery. We tolerate TCP disconnects by reconnecting (with
22 * exponential backoff) in the case of a fault (disconnection, bad
23 * crc, protocol error). Acks allow sent messages to be discarded by
24 * the sender.
25 */
26
27/* static tag bytes (protocol control messages) */
28static char tag_msg = CEPH_MSGR_TAG_MSG;
29static char tag_ack = CEPH_MSGR_TAG_ACK;
30static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
31
32
33static void queue_con(struct ceph_connection *con);
34static void con_work(struct work_struct *);
35static void ceph_fault(struct ceph_connection *con);
36
37const char *ceph_name_type_str(int t)
38{
39 switch (t) {
40 case CEPH_ENTITY_TYPE_MON: return "mon";
41 case CEPH_ENTITY_TYPE_MDS: return "mds";
42 case CEPH_ENTITY_TYPE_OSD: return "osd";
43 case CEPH_ENTITY_TYPE_CLIENT: return "client";
44 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
45 default: return "???";
46 }
47}
48
49/*
50 * nicely render a sockaddr as a string.
51 */
52#define MAX_ADDR_STR 20
 53static char addr_str[MAX_ADDR_STR][48];	/* 48 fits a full IPv6 addr + port */
54static DEFINE_SPINLOCK(addr_str_lock);
55static int last_addr_str;
56
57const char *pr_addr(const struct sockaddr_storage *ss)
58{
59 int i;
60 char *s;
61 struct sockaddr_in *in4 = (void *)ss;
62 unsigned char *quad = (void *)&in4->sin_addr.s_addr;
63 struct sockaddr_in6 *in6 = (void *)ss;
64
65 spin_lock(&addr_str_lock);
66 i = last_addr_str++;
67 if (last_addr_str == MAX_ADDR_STR)
68 last_addr_str = 0;
69 spin_unlock(&addr_str_lock);
70 s = addr_str[i];
71
72 switch (ss->ss_family) {
73 case AF_INET:
74 sprintf(s, "%u.%u.%u.%u:%u",
75 (unsigned int)quad[0],
76 (unsigned int)quad[1],
77 (unsigned int)quad[2],
78 (unsigned int)quad[3],
79 (unsigned int)ntohs(in4->sin_port));
80 break;
81
82 case AF_INET6:
83 sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u",
 84			ntohs(in6->sin6_addr.s6_addr16[0]),
 85			ntohs(in6->sin6_addr.s6_addr16[1]),
 86			ntohs(in6->sin6_addr.s6_addr16[2]),
 87			ntohs(in6->sin6_addr.s6_addr16[3]),
 88			ntohs(in6->sin6_addr.s6_addr16[4]),
 89			ntohs(in6->sin6_addr.s6_addr16[5]),
 90			ntohs(in6->sin6_addr.s6_addr16[6]),
 91			ntohs(in6->sin6_addr.s6_addr16[7]),
92 (unsigned int)ntohs(in6->sin6_port));
93 break;
94
95 default:
96 sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
97 }
98
99 return s;
100}
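/*
 * Note the rotating buffer: up to MAX_ADDR_STR results can be live at
 * once, so e.g. two pr_addr() calls in a single printk each get their
 * own slot, but a returned string is only valid until the pool wraps.
 */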
101
102static void encode_my_addr(struct ceph_messenger *msgr)
103{
104 memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
105 ceph_encode_addr(&msgr->my_enc_addr);
106}
107
108/*
109 * work queue for all reading and writing to/from the socket.
110 */
111struct workqueue_struct *ceph_msgr_wq;
112
113int __init ceph_msgr_init(void)
114{
115 ceph_msgr_wq = create_workqueue("ceph-msgr");
116 if (IS_ERR(ceph_msgr_wq)) {
117 int ret = PTR_ERR(ceph_msgr_wq);
118 pr_err("msgr_init failed to create workqueue: %d\n", ret);
119 ceph_msgr_wq = NULL;
120 return ret;
121 }
122 return 0;
123}
124
125void ceph_msgr_exit(void)
126{
127 destroy_workqueue(ceph_msgr_wq);
128}
129
130/*
131 * socket callback functions
132 */
133
134/* data available on socket, or listen socket received a connect */
135static void ceph_data_ready(struct sock *sk, int count_unused)
136{
137 struct ceph_connection *con =
138 (struct ceph_connection *)sk->sk_user_data;
139 if (sk->sk_state != TCP_CLOSE_WAIT) {
140 dout("ceph_data_ready on %p state = %lu, queueing work\n",
141 con, con->state);
142 queue_con(con);
143 }
144}
145
146/* socket has buffer space for writing */
147static void ceph_write_space(struct sock *sk)
148{
149 struct ceph_connection *con =
150 (struct ceph_connection *)sk->sk_user_data;
151
152 /* only queue to workqueue if there is data we want to write. */
153 if (test_bit(WRITE_PENDING, &con->state)) {
154 dout("ceph_write_space %p queueing write work\n", con);
155 queue_con(con);
156 } else {
157 dout("ceph_write_space %p nothing to write\n", con);
158 }
159
160 /* since we have our own write_space, clear the SOCK_NOSPACE flag */
161 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
162}
163
164/* socket's state has changed */
165static void ceph_state_change(struct sock *sk)
166{
167 struct ceph_connection *con =
168 (struct ceph_connection *)sk->sk_user_data;
169
170 dout("ceph_state_change %p state = %lu sk_state = %u\n",
171 con, con->state, sk->sk_state);
172
173 if (test_bit(CLOSED, &con->state))
174 return;
175
176 switch (sk->sk_state) {
177 case TCP_CLOSE:
178 dout("ceph_state_change TCP_CLOSE\n"); /* fall through */
179 case TCP_CLOSE_WAIT:
180 dout("ceph_state_change TCP_CLOSE_WAIT\n");
181 if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
182 if (test_bit(CONNECTING, &con->state))
183 con->error_msg = "connection failed";
184 else
185 con->error_msg = "socket closed";
186 queue_con(con);
187 }
188 break;
189 case TCP_ESTABLISHED:
190 dout("ceph_state_change TCP_ESTABLISHED\n");
191 queue_con(con);
192 break;
193 }
194}
195
196/*
197 * set up socket callbacks
198 */
199static void set_sock_callbacks(struct socket *sock,
200 struct ceph_connection *con)
201{
202 struct sock *sk = sock->sk;
203 sk->sk_user_data = (void *)con;
204 sk->sk_data_ready = ceph_data_ready;
205 sk->sk_write_space = ceph_write_space;
206 sk->sk_state_change = ceph_state_change;
207}
208
209
210/*
211 * socket helpers
212 */
213
214/*
215 * initiate connection to a remote socket.
216 */
217static struct socket *ceph_tcp_connect(struct ceph_connection *con)
218{
219 struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr;
220 struct socket *sock;
221 int ret;
222
223 BUG_ON(con->sock);
224 ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
225 if (ret)
226 return ERR_PTR(ret);
227 con->sock = sock;
228 sock->sk->sk_allocation = GFP_NOFS;
229
230 set_sock_callbacks(sock, con);
231
232 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
233
234 ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK);
235 if (ret == -EINPROGRESS) {
236 dout("connect %s EINPROGRESS sk_state = %u\n",
237 pr_addr(&con->peer_addr.in_addr),
238 sock->sk->sk_state);
239 ret = 0;
240 }
241 if (ret < 0) {
242 pr_err("connect %s error %d\n",
243 pr_addr(&con->peer_addr.in_addr), ret);
244 sock_release(sock);
245 con->sock = NULL;
246 con->error_msg = "connect error";
247 }
248
249 if (ret < 0)
250 return ERR_PTR(ret);
251 return sock;
252}
253
254static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
255{
256 struct kvec iov = {buf, len};
257 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
258
259 return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
260}
261
262/*
263 * write something. @more is true if caller will be sending more data
264 * shortly.
265 */
266static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
267 size_t kvlen, size_t len, int more)
268{
269 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
270
271 if (more)
272 msg.msg_flags |= MSG_MORE;
273 else
274 msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
275
276 return kernel_sendmsg(sock, &msg, iov, kvlen, len);
277}
278
279
280/*
281 * Shutdown/close the socket for the given connection.
282 */
283static int con_close_socket(struct ceph_connection *con)
284{
285 int rc;
286
287 dout("con_close_socket on %p sock %p\n", con, con->sock);
288 if (!con->sock)
289 return 0;
290 set_bit(SOCK_CLOSED, &con->state);
291 rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
292 sock_release(con->sock);
293 con->sock = NULL;
294 clear_bit(SOCK_CLOSED, &con->state);
295 return rc;
296}
297
298/*
299 * Reset a connection. Discard all incoming and outgoing messages
300 * and clear *_seq state.
301 */
302static void ceph_msg_remove(struct ceph_msg *msg)
303{
304 list_del_init(&msg->list_head);
305 ceph_msg_put(msg);
306}
307static void ceph_msg_remove_list(struct list_head *head)
308{
309 while (!list_empty(head)) {
310 struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
311 list_head);
312 ceph_msg_remove(msg);
313 }
314}
315
316static void reset_connection(struct ceph_connection *con)
317{
318 /* discard the out_queue and out_sent lists, drop any in-flight
319 * in_msg/out_msg, and zero the msg and connect sequence counters */
320 ceph_msg_remove_list(&con->out_queue);
321 ceph_msg_remove_list(&con->out_sent);
322
323 if (con->in_msg) {
324 ceph_msg_put(con->in_msg);
325 con->in_msg = NULL;
326 }
327
328 con->connect_seq = 0;
329 con->out_seq = 0;
330 if (con->out_msg) {
331 ceph_msg_put(con->out_msg);
332 con->out_msg = NULL;
333 }
334 con->in_seq = 0;
335}
336
337/*
338 * mark a peer down. drop any open connections.
339 */
340void ceph_con_close(struct ceph_connection *con)
341{
342 dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
343 set_bit(CLOSED, &con->state); /* in case there's queued work */
344 clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
345 clear_bit(LOSSYTX, &con->state); /* so we retry next connect */
346 clear_bit(KEEPALIVE_PENDING, &con->state);
347 clear_bit(WRITE_PENDING, &con->state);
348 mutex_lock(&con->mutex);
349 reset_connection(con);
350 cancel_delayed_work(&con->work);
351 mutex_unlock(&con->mutex);
352 queue_con(con);
353}
354
355/*
356 * Reopen a closed connection, with a new peer address.
357 */
358void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
359{
360 dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
361 set_bit(OPENING, &con->state);
362 clear_bit(CLOSED, &con->state);
363 memcpy(&con->peer_addr, addr, sizeof(*addr));
364 con->delay = 0; /* reset backoff memory */
365 queue_con(con);
366}
367
368/*
369 * generic get/put
370 */
371struct ceph_connection *ceph_con_get(struct ceph_connection *con)
372{
373 dout("con_get %p nref = %d -> %d\n", con,
374 atomic_read(&con->nref), atomic_read(&con->nref) + 1);
375 if (atomic_inc_not_zero(&con->nref))
376 return con;
377 return NULL;
378}
379
380void ceph_con_put(struct ceph_connection *con)
381{
382 dout("con_put %p nref = %d -> %d\n", con,
383 atomic_read(&con->nref), atomic_read(&con->nref) - 1);
384 BUG_ON(atomic_read(&con->nref) == 0);
385 if (atomic_dec_and_test(&con->nref)) {
386 BUG_ON(con->sock);
387 kfree(con);
388 }
389}
390
391/*
392 * initialize a new connection.
393 */
394void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
395{
396 dout("con_init %p\n", con);
397 memset(con, 0, sizeof(*con));
398 atomic_set(&con->nref, 1);
399 con->msgr = msgr;
400 mutex_init(&con->mutex);
401 INIT_LIST_HEAD(&con->out_queue);
402 INIT_LIST_HEAD(&con->out_sent);
403 INIT_DELAYED_WORK(&con->work, con_work);
404}
405
406
407/*
408 * We maintain a global counter to order connection attempts. Get
409 * a unique seq greater than @gt.
410 */
411static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
412{
413 u32 ret;
414
415 spin_lock(&msgr->global_seq_lock);
416 if (msgr->global_seq < gt)
417 msgr->global_seq = gt;
418 ret = ++msgr->global_seq;
419 spin_unlock(&msgr->global_seq_lock);
420 return ret;
421}
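
/*
 * Sketch of the ordering guarantee (the values are illustrative): each
 * call returns a value strictly greater than both @gt and every value
 * handed out before it, so connection attempts are totally ordered
 * across the messenger.
 */
#if 0
	u32 a = get_global_seq(msgr, 0);	/* say, 1 */
	u32 b = get_global_seq(msgr, 10);	/* >= 11 */
	u32 c = get_global_seq(msgr, 0);	/* > b even though gt == 0 */
#endif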
422
423
424/*
425 * Prepare footer for currently outgoing message, and finish things
426 * off. Assumes out_kvec* are already valid; we just add on to the end.
427 */
428static void prepare_write_message_footer(struct ceph_connection *con, int v)
429{
430 struct ceph_msg *m = con->out_msg;
431
432 dout("prepare_write_message_footer %p\n", con);
433 con->out_kvec_is_msg = true;
434 con->out_kvec[v].iov_base = &m->footer;
435 con->out_kvec[v].iov_len = sizeof(m->footer);
436 con->out_kvec_bytes += sizeof(m->footer);
437 con->out_kvec_left++;
438 con->out_more = m->more_to_follow;
439 con->out_msg_done = true;
440}
441
442/*
443 * Prepare headers for the next outgoing message.
444 */
445static void prepare_write_message(struct ceph_connection *con)
446{
447 struct ceph_msg *m;
448 int v = 0;
449
450 con->out_kvec_bytes = 0;
451 con->out_kvec_is_msg = true;
452 con->out_msg_done = false;
453
454 /* Sneak an ack in there first? If we can get it into the same
455 * TCP packet that's a good thing. */
456 if (con->in_seq > con->in_seq_acked) {
457 con->in_seq_acked = con->in_seq;
458 con->out_kvec[v].iov_base = &tag_ack;
459 con->out_kvec[v++].iov_len = 1;
460 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
461 con->out_kvec[v].iov_base = &con->out_temp_ack;
462 con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
463 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
464 }
465
466 m = list_first_entry(&con->out_queue,
467 struct ceph_msg, list_head);
468 con->out_msg = m;
469 if (test_bit(LOSSYTX, &con->state)) {
470 list_del_init(&m->list_head);
471 } else {
472 /* put message on sent list */
473 ceph_msg_get(m);
474 list_move_tail(&m->list_head, &con->out_sent);
475 }
476
477 m->hdr.seq = cpu_to_le64(++con->out_seq);
478
479 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
480 m, con->out_seq, le16_to_cpu(m->hdr.type),
481 le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
482 le32_to_cpu(m->hdr.data_len),
483 m->nr_pages);
484 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
485
486 /* tag + hdr + front + middle */
487 con->out_kvec[v].iov_base = &tag_msg;
488 con->out_kvec[v++].iov_len = 1;
489 con->out_kvec[v].iov_base = &m->hdr;
490 con->out_kvec[v++].iov_len = sizeof(m->hdr);
491 con->out_kvec[v++] = m->front;
492 if (m->middle)
493 con->out_kvec[v++] = m->middle->vec;
494 con->out_kvec_left = v;
495 con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
496 (m->middle ? m->middle->vec.iov_len : 0);
497 con->out_kvec_cur = con->out_kvec;
498
499 /* fill in crc (except data pages), footer */
500 con->out_msg->hdr.crc =
501 cpu_to_le32(crc32c(0, (void *)&m->hdr,
502 sizeof(m->hdr) - sizeof(m->hdr.crc)));
503 con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
504 con->out_msg->footer.front_crc =
505 cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
506 if (m->middle)
507 con->out_msg->footer.middle_crc =
508 cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
509 m->middle->vec.iov_len));
510 else
511 con->out_msg->footer.middle_crc = 0;
512 con->out_msg->footer.data_crc = 0;
513 dout("prepare_write_message front_crc %u data_crc %u\n",
514 le32_to_cpu(con->out_msg->footer.front_crc),
515 le32_to_cpu(con->out_msg->footer.middle_crc));
516
517 /* is there a data payload? */
518 if (le32_to_cpu(m->hdr.data_len) > 0) {
519 /* initialize page iterator */
520 con->out_msg_pos.page = 0;
521 con->out_msg_pos.page_pos =
522 le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
523 con->out_msg_pos.data_pos = 0;
524 con->out_msg_pos.did_page_crc = 0;
525 con->out_more = 1; /* data + footer will follow */
526 } else {
527 /* no, queue up footer too and be done */
528 prepare_write_message_footer(con, v);
529 }
530
531 set_bit(WRITE_PENDING, &con->state);
532}
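
/*
 * Wire layout queued by the function above for a message with a middle
 * section and no data payload (an optional [tag_ack][le64 ack_seq] pair
 * is prepended when an ack can be piggybacked):
 *
 *   [tag_msg][ceph_msg_header][front][middle][ceph_msg_footer]
 *
 * When there is a data payload, the footer is deferred until the pages
 * have been written (see write_partial_msg_pages() below).
 */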
533
534/*
535 * Prepare an ack.
536 */
537static void prepare_write_ack(struct ceph_connection *con)
538{
539 dout("prepare_write_ack %p %llu -> %llu\n", con,
540 con->in_seq_acked, con->in_seq);
541 con->in_seq_acked = con->in_seq;
542
543 con->out_kvec[0].iov_base = &tag_ack;
544 con->out_kvec[0].iov_len = 1;
545 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
546 con->out_kvec[1].iov_base = &con->out_temp_ack;
547 con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
548 con->out_kvec_left = 2;
549 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
550 con->out_kvec_cur = con->out_kvec;
551 con->out_more = 1; /* more will follow.. eventually.. */
552 set_bit(WRITE_PENDING, &con->state);
553}
554
555/*
556 * Prepare to write keepalive byte.
557 */
558static void prepare_write_keepalive(struct ceph_connection *con)
559{
560 dout("prepare_write_keepalive %p\n", con);
561 con->out_kvec[0].iov_base = &tag_keepalive;
562 con->out_kvec[0].iov_len = 1;
563 con->out_kvec_left = 1;
564 con->out_kvec_bytes = 1;
565 con->out_kvec_cur = con->out_kvec;
566 set_bit(WRITE_PENDING, &con->state);
567}
568
569/*
570 * Connection negotiation.
571 */
572
573static void prepare_connect_authorizer(struct ceph_connection *con)
574{
575 void *auth_buf = NULL; /* stays NULL (with len 0) without get_authorizer */
576 int auth_len = 0;
577 int auth_protocol = 0;
578
579 mutex_unlock(&con->mutex);
580 if (con->ops->get_authorizer)
581 con->ops->get_authorizer(con, &auth_buf, &auth_len,
582 &auth_protocol, &con->auth_reply_buf,
583 &con->auth_reply_buf_len,
584 con->auth_retry);
585 mutex_lock(&con->mutex);
586
587 con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
588 con->out_connect.authorizer_len = cpu_to_le32(auth_len);
589
590 con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
591 con->out_kvec[con->out_kvec_left].iov_len = auth_len;
592 con->out_kvec_left++;
593 con->out_kvec_bytes += auth_len;
594}
595
596/*
597 * We connected to a peer and are saying hello.
598 */
599static void prepare_write_banner(struct ceph_messenger *msgr,
600 struct ceph_connection *con)
601{
602 int len = strlen(CEPH_BANNER);
603
604 con->out_kvec[0].iov_base = CEPH_BANNER;
605 con->out_kvec[0].iov_len = len;
606 con->out_kvec[1].iov_base = &msgr->my_enc_addr;
607 con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
608 con->out_kvec_left = 2;
609 con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
610 con->out_kvec_cur = con->out_kvec;
611 con->out_more = 0;
612 set_bit(WRITE_PENDING, &con->state);
613}
614
615static void prepare_write_connect(struct ceph_messenger *msgr,
616 struct ceph_connection *con,
617 int after_banner)
618{
619 unsigned global_seq = get_global_seq(con->msgr, 0);
620 int proto;
621
622 switch (con->peer_name.type) {
623 case CEPH_ENTITY_TYPE_MON:
624 proto = CEPH_MONC_PROTOCOL;
625 break;
626 case CEPH_ENTITY_TYPE_OSD:
627 proto = CEPH_OSDC_PROTOCOL;
628 break;
629 case CEPH_ENTITY_TYPE_MDS:
630 proto = CEPH_MDSC_PROTOCOL;
631 break;
632 default:
633 BUG();
634 }
635
636 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
637 con->connect_seq, global_seq, proto);
638
639 con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED);
640 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
641 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
642 con->out_connect.global_seq = cpu_to_le32(global_seq);
643 con->out_connect.protocol_version = cpu_to_le32(proto);
644 con->out_connect.flags = 0;
645
646 if (!after_banner) {
647 con->out_kvec_left = 0;
648 con->out_kvec_bytes = 0;
649 }
650 con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
651 con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
652 con->out_kvec_left++;
653 con->out_kvec_bytes += sizeof(con->out_connect);
654 con->out_kvec_cur = con->out_kvec;
655 con->out_more = 0;
656 set_bit(WRITE_PENDING, &con->state);
657
658 prepare_connect_authorizer(con);
659}
660
661
662/*
663 * write as much of pending kvecs to the socket as we can.
664 * 1 -> done
665 * 0 -> socket full, but more to do
666 * <0 -> error
667 */
668static int write_partial_kvec(struct ceph_connection *con)
669{
670 int ret;
671
672 dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
673 while (con->out_kvec_bytes > 0) {
674 ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
675 con->out_kvec_left, con->out_kvec_bytes,
676 con->out_more);
677 if (ret <= 0)
678 goto out;
679 con->out_kvec_bytes -= ret;
680 if (con->out_kvec_bytes == 0)
681 break; /* done */
682 while (ret > 0) {
683 if (ret >= con->out_kvec_cur->iov_len) {
684 ret -= con->out_kvec_cur->iov_len;
685 con->out_kvec_cur++;
686 con->out_kvec_left--;
687 } else {
688 con->out_kvec_cur->iov_len -= ret;
689 con->out_kvec_cur->iov_base += ret;
690 ret = 0;
691 break;
692 }
693 }
694 }
695 con->out_kvec_left = 0;
696 con->out_kvec_is_msg = false;
697 ret = 1;
698out:
699 dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
700 con->out_kvec_bytes, con->out_kvec_left, ret);
701 return ret;
702}
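
/*
 * Worked example of the bookkeeping above: suppose three kvecs of 1, 53
 * and 200 bytes are queued (tag, header, front) and sendmsg() accepts
 * 60 bytes. The inner loop consumes the first two kvecs entirely and
 * advances iov_base of the third by 6, leaving out_kvec_left == 1 and
 * out_kvec_bytes == 194 for the next pass.
 */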
703
704/*
705 * Write as much message data payload as we can. If we finish, queue
706 * up the footer.
707 * 1 -> done, footer is now queued in out_kvec[].
708 * 0 -> socket full, but more to do
709 * <0 -> error
710 */
711static int write_partial_msg_pages(struct ceph_connection *con)
712{
713 struct ceph_msg *msg = con->out_msg;
714 unsigned data_len = le32_to_cpu(msg->hdr.data_len);
715 size_t len;
716 int crc = !con->msgr->nocrc; /* data crcs are on unless "nocrc" is set */
717 int ret;
718
719 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
720 con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
721 con->out_msg_pos.page_pos);
722
723 while (con->out_msg_pos.page < con->out_msg->nr_pages) {
724 struct page *page = NULL;
725 void *kaddr = NULL;
726
727 /*
728 * if we are calculating the data crc (the default), we need
729 * to map the page. if our pages[] has been revoked, use the
730 * zero page.
731 */
732 if (msg->pages) {
733 page = msg->pages[con->out_msg_pos.page];
734 if (crc)
735 kaddr = kmap(page);
736 } else if (msg->pagelist) {
737 page = list_first_entry(&msg->pagelist->head,
738 struct page, lru);
739 if (crc)
740 kaddr = kmap(page);
741 } else {
742 page = con->msgr->zero_page;
743 if (crc)
744 kaddr = page_address(con->msgr->zero_page);
745 }
746 len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
747 (int)(data_len - con->out_msg_pos.data_pos));
748 if (crc && !con->out_msg_pos.did_page_crc) {
749 void *base = kaddr + con->out_msg_pos.page_pos;
750 u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
751
752 BUG_ON(kaddr == NULL);
753 con->out_msg->footer.data_crc =
754 cpu_to_le32(crc32c(tmpcrc, base, len));
755 con->out_msg_pos.did_page_crc = 1;
756 }
757
758 ret = kernel_sendpage(con->sock, page,
759 con->out_msg_pos.page_pos, len,
760 MSG_DONTWAIT | MSG_NOSIGNAL |
761 MSG_MORE);
762
763 if (crc && (msg->pages || msg->pagelist))
764 kunmap(page);
765
766 if (ret <= 0)
767 goto out;
768
769 con->out_msg_pos.data_pos += ret;
770 con->out_msg_pos.page_pos += ret;
771 if (ret == len) {
772 con->out_msg_pos.page_pos = 0;
773 con->out_msg_pos.page++;
774 con->out_msg_pos.did_page_crc = 0;
775 if (msg->pagelist)
776 list_move_tail(&page->lru,
777 &msg->pagelist->head);
778 }
779 }
780
781 dout("write_partial_msg_pages %p msg %p done\n", con, msg);
782
783 /* prepare and queue up footer, too */
784 if (!crc)
785 con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
786 con->out_kvec_bytes = 0;
787 con->out_kvec_left = 0;
788 con->out_kvec_cur = con->out_kvec;
789 prepare_write_message_footer(con, 0);
790 ret = 1;
791out:
792 return ret;
793}
794
795/*
796 * write some zeros
797 */
798static int write_partial_skip(struct ceph_connection *con)
799{
800 int ret;
801
802 while (con->out_skip > 0) {
803 struct kvec iov = {
804 .iov_base = page_address(con->msgr->zero_page),
805 .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
806 };
807
808 ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
809 if (ret <= 0)
810 goto out;
811 con->out_skip -= ret;
812 }
813 ret = 1;
814out:
815 return ret;
816}
817
818/*
819 * Prepare to read connection handshake, or an ack.
820 */
821static void prepare_read_banner(struct ceph_connection *con)
822{
823 dout("prepare_read_banner %p\n", con);
824 con->in_base_pos = 0;
825}
826
827static void prepare_read_connect(struct ceph_connection *con)
828{
829 dout("prepare_read_connect %p\n", con);
830 con->in_base_pos = 0;
831}
832
833static void prepare_read_connect_retry(struct ceph_connection *con)
834{
835 dout("prepare_read_connect_retry %p\n", con);
836 con->in_base_pos = strlen(CEPH_BANNER) + sizeof(con->actual_peer_addr)
837 + sizeof(con->peer_addr_for_me);
838}
839
840static void prepare_read_ack(struct ceph_connection *con)
841{
842 dout("prepare_read_ack %p\n", con);
843 con->in_base_pos = 0;
844}
845
846static void prepare_read_tag(struct ceph_connection *con)
847{
848 dout("prepare_read_tag %p\n", con);
849 con->in_base_pos = 0;
850 con->in_tag = CEPH_MSGR_TAG_READY;
851}
852
853/*
854 * Prepare to read a message.
855 */
856static int prepare_read_message(struct ceph_connection *con)
857{
858 dout("prepare_read_message %p\n", con);
859 BUG_ON(con->in_msg != NULL);
860 con->in_base_pos = 0;
861 con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
862 return 0;
863}
864
865
866static int read_partial(struct ceph_connection *con,
867 int *to, int size, void *object)
868{
869 *to += size;
870 while (con->in_base_pos < *to) {
871 int left = *to - con->in_base_pos;
872 int have = size - left;
873 int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
874 if (ret <= 0)
875 return ret;
876 con->in_base_pos += ret;
877 }
878 return 1;
879}
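
/*
 * Worked example: read_partial() advances *to by @size, so consecutive
 * calls describe back-to-back fields of the incoming stream. If
 * in_base_pos is already 10 when a 16-byte object is requested at
 * *to == 0, only the missing bytes 10..15 are pulled off the socket,
 * into object + 10.
 */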
880
881
882/*
883 * Read all or part of the connect-side handshake on a new connection
884 */
885static int read_partial_banner(struct ceph_connection *con)
886{
887 int ret, to = 0;
888
889 dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
890
891 /* peer's banner */
892 ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
893 if (ret <= 0)
894 goto out;
895 ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
896 &con->actual_peer_addr);
897 if (ret <= 0)
898 goto out;
899 ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
900 &con->peer_addr_for_me);
901 if (ret <= 0)
902 goto out;
903out:
904 return ret;
905}
906
907static int read_partial_connect(struct ceph_connection *con)
908{
909 int ret, to = 0;
910
911 dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
912
913 ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
914 if (ret <= 0)
915 goto out;
916 ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
917 con->auth_reply_buf);
918 if (ret <= 0)
919 goto out;
920
921 dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
922 con, (int)con->in_reply.tag,
923 le32_to_cpu(con->in_reply.connect_seq),
924 le32_to_cpu(con->in_reply.global_seq));
925out:
926 return ret;
927}
929
930/*
931 * Verify the hello banner looks okay.
932 */
933static int verify_hello(struct ceph_connection *con)
934{
935 if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
936 pr_err("connect to %s got bad banner\n",
937 pr_addr(&con->peer_addr.in_addr));
938 con->error_msg = "protocol error, bad banner";
939 return -1;
940 }
941 return 0;
942}
943
944static bool addr_is_blank(struct sockaddr_storage *ss)
945{
946 switch (ss->ss_family) {
947 case AF_INET:
948 return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
949 case AF_INET6:
950 return
951 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
952 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
953 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
954 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
955 }
956 return false;
957}
958
959static int addr_port(struct sockaddr_storage *ss)
960{
961 switch (ss->ss_family) {
962 case AF_INET:
963 return ntohs(((struct sockaddr_in *)ss)->sin_port);
964 case AF_INET6:
965 return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
966 }
967 return 0;
968}
969
970static void addr_set_port(struct sockaddr_storage *ss, int p)
971{
972 switch (ss->ss_family) {
973 case AF_INET:
974 ((struct sockaddr_in *)ss)->sin_port = htons(p); break;
975 case AF_INET6:
976 ((struct sockaddr_in6 *)ss)->sin6_port = htons(p); break;
977 }
978}
979
980/*
981 * Parse an ip[:port] list into an addr array. Use the default
982 * monitor port if a port isn't specified.
983 */
984int ceph_parse_ips(const char *c, const char *end,
985 struct ceph_entity_addr *addr,
986 int max_count, int *count)
987{
988 int i;
989 const char *p = c;
990
991 dout("parse_ips on '%.*s'\n", (int)(end-c), c);
992 for (i = 0; i < max_count; i++) {
993 const char *ipend;
994 struct sockaddr_storage *ss = &addr[i].in_addr;
995 struct sockaddr_in *in4 = (void *)ss;
996 struct sockaddr_in6 *in6 = (void *)ss;
997 int port;
998
999 memset(ss, 0, sizeof(*ss));
1000 if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
1001 ',', &ipend)) {
1002 ss->ss_family = AF_INET;
1003 } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
1004 ',', &ipend)) {
1005 ss->ss_family = AF_INET6;
1006 } else {
1007 goto bad;
1008 }
1009 p = ipend;
1010
1011 /* port? */
1012 if (p < end && *p == ':') {
1013 port = 0;
1014 p++;
1015 while (p < end && *p >= '0' && *p <= '9') {
1016 port = (port * 10) + (*p - '0');
1017 p++;
1018 }
1019 if (port > 65535 || port == 0)
1020 goto bad;
1021 } else {
1022 port = CEPH_MON_PORT;
1023 }
1024
1025 addr_set_port(ss, port);
1026
1027 dout("parse_ips got %s\n", pr_addr(ss));
1028
1029 if (p == end)
1030 break;
1031 if (*p != ',')
1032 goto bad;
1033 p++;
1034 }
1035
1036 if (p != end)
1037 goto bad;
1038
1039 if (count)
1040 *count = i + 1;
1041 return 0;
1042
1043bad:
1044 pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
1045 return -EINVAL;
1046}
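
/*
 * Usage sketch for ceph_parse_ips(); the monitor addresses are
 * hypothetical, and the second entry, which has no explicit port,
 * falls back to CEPH_MON_PORT:
 */
#if 0
	struct ceph_entity_addr mon_addr[2];
	int num = 0;
	const char *s = "192.168.0.1:6789,192.168.0.2";

	if (ceph_parse_ips(s, s + strlen(s), mon_addr, 2, &num) < 0)
		return -EINVAL;	/* malformed address list */
	/* here num == 2 and both ports are set */
#endif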
1047
1048static int process_banner(struct ceph_connection *con)
1049{
1050 dout("process_banner on %p\n", con);
1051
1052 if (verify_hello(con) < 0)
1053 return -1;
1054
1055 ceph_decode_addr(&con->actual_peer_addr);
1056 ceph_decode_addr(&con->peer_addr_for_me);
1057
1058 /*
1059 * Make sure the other end is who we wanted. note that the other
1060 * end may not yet know their ip address, so if it's 0.0.0.0, give
1061 * them the benefit of the doubt.
1062 */
1063 if (memcmp(&con->peer_addr, &con->actual_peer_addr,
1064 sizeof(con->peer_addr)) != 0 &&
1065 !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
1066 con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
1067 pr_warning("wrong peer, want %s/%lld, got %s/%lld\n",
1068 pr_addr(&con->peer_addr.in_addr),
1069 le64_to_cpu(con->peer_addr.nonce),
1070 pr_addr(&con->actual_peer_addr.in_addr),
1071 le64_to_cpu(con->actual_peer_addr.nonce));
1072 con->error_msg = "wrong peer at address";
1073 return -1;
1074 }
1075
1076 /*
1077 * did we learn our address?
1078 */
1079 if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
1080 int port = addr_port(&con->msgr->inst.addr.in_addr);
1081
1082 memcpy(&con->msgr->inst.addr.in_addr,
1083 &con->peer_addr_for_me.in_addr,
1084 sizeof(con->peer_addr_for_me.in_addr));
1085 addr_set_port(&con->msgr->inst.addr.in_addr, port);
1086 encode_my_addr(con->msgr);
1087 dout("process_banner learned my addr is %s\n",
1088 pr_addr(&con->msgr->inst.addr.in_addr));
1089 }
1090
1091 set_bit(NEGOTIATING, &con->state);
1092 prepare_read_connect(con);
1093 return 0;
1094}
1095
1096static void fail_protocol(struct ceph_connection *con)
1097{
1098 reset_connection(con);
1099 set_bit(CLOSED, &con->state); /* in case there's queued work */
1100
1101 mutex_unlock(&con->mutex);
1102 if (con->ops->bad_proto)
1103 con->ops->bad_proto(con);
1104 mutex_lock(&con->mutex);
1105}
1106
1107static int process_connect(struct ceph_connection *con)
1108{
1109 u64 sup_feat = CEPH_FEATURE_SUPPORTED;
1110 u64 req_feat = CEPH_FEATURE_REQUIRED;
1111 u64 server_feat = le64_to_cpu(con->in_reply.features);
1112
1113 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
1114
1115 switch (con->in_reply.tag) {
1116 case CEPH_MSGR_TAG_FEATURES:
1117 pr_err("%s%lld %s feature set mismatch,"
1118 " my %llx < server's %llx, missing %llx\n",
1119 ENTITY_NAME(con->peer_name),
1120 pr_addr(&con->peer_addr.in_addr),
1121 sup_feat, server_feat, server_feat & ~sup_feat);
1122 con->error_msg = "missing required protocol features";
1123 fail_protocol(con);
1124 return -1;
1125
1126 case CEPH_MSGR_TAG_BADPROTOVER:
1127 pr_err("%s%lld %s protocol version mismatch,"
1128 " my %d != server's %d\n",
1129 ENTITY_NAME(con->peer_name),
1130 pr_addr(&con->peer_addr.in_addr),
1131 le32_to_cpu(con->out_connect.protocol_version),
1132 le32_to_cpu(con->in_reply.protocol_version));
1133 con->error_msg = "protocol version mismatch";
1134 fail_protocol(con);
1135 return -1;
1136
1137 case CEPH_MSGR_TAG_BADAUTHORIZER:
1138 con->auth_retry++;
1139 dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
1140 con->auth_retry);
1141 if (con->auth_retry == 2) {
1142 con->error_msg = "connect authorization failure";
1143 reset_connection(con);
1144 set_bit(CLOSED, &con->state);
1145 return -1;
1146 }
1147 con->auth_retry = 1;
1148 prepare_write_connect(con->msgr, con, 0);
1149 prepare_read_connect_retry(con);
1150 break;
1151
1152 case CEPH_MSGR_TAG_RESETSESSION:
1153 /*
1154 * If we connected with a large connect_seq but the peer
1155 * has no record of a session with us (no connection, or
1156 * connect_seq == 0), they will send RESETSESSION to indicate
1157 * that they must have reset their session, and may have
1158 * dropped messages.
1159 */
1160 dout("process_connect got RESET peer seq %u\n",
1161 le32_to_cpu(con->in_connect.connect_seq));
1162 pr_err("%s%lld %s connection reset\n",
1163 ENTITY_NAME(con->peer_name),
1164 pr_addr(&con->peer_addr.in_addr));
1165 reset_connection(con);
1166 prepare_write_connect(con->msgr, con, 0);
1167 prepare_read_connect(con);
1168
1169 /* Tell ceph about it. */
1170 mutex_unlock(&con->mutex);
1171 pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
1172 if (con->ops->peer_reset)
1173 con->ops->peer_reset(con);
1174 mutex_lock(&con->mutex);
1175 break;
1176
1177 case CEPH_MSGR_TAG_RETRY_SESSION:
1178 /*
1179 * If we sent a smaller connect_seq than the peer has, try
1180 * again with a larger value.
1181 */
1182 dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
1183 le32_to_cpu(con->out_connect.connect_seq),
1184 le32_to_cpu(con->in_connect.connect_seq));
1185 con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
1186 prepare_write_connect(con->msgr, con, 0);
1187 prepare_read_connect(con);
1188 break;
1189
1190 case CEPH_MSGR_TAG_RETRY_GLOBAL:
1191 /*
1192 * If we sent a smaller global_seq than the peer has, try
1193 * again with a larger value.
1194 */
1195 dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
1196 con->peer_global_seq,
1197 le32_to_cpu(con->in_connect.global_seq));
1198 get_global_seq(con->msgr,
1199 le32_to_cpu(con->in_connect.global_seq));
1200 prepare_write_connect(con->msgr, con, 0);
1201 prepare_read_connect(con);
1202 break;
1203
1204 case CEPH_MSGR_TAG_READY:
1205 if (req_feat & ~server_feat) {
1206 pr_err("%s%lld %s protocol feature mismatch,"
1207 " my required %llx > server's %llx, need %llx\n",
1208 ENTITY_NAME(con->peer_name),
1209 pr_addr(&con->peer_addr.in_addr),
1210 req_feat, server_feat, req_feat & ~server_feat);
1211 con->error_msg = "missing required protocol features";
1212 fail_protocol(con);
1213 return -1;
1214 }
1215 clear_bit(CONNECTING, &con->state);
1216 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1217 con->connect_seq++;
1218 dout("process_connect got READY gseq %d cseq %d (%d)\n",
1219 con->peer_global_seq,
1220 le32_to_cpu(con->in_reply.connect_seq),
1221 con->connect_seq);
1222 WARN_ON(con->connect_seq !=
1223 le32_to_cpu(con->in_reply.connect_seq));
1224
1225 if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
1226 set_bit(LOSSYTX, &con->state);
1227
1228 prepare_read_tag(con);
1229 break;
1230
1231 case CEPH_MSGR_TAG_WAIT:
1232 /*
1233 * If there is a connection race (we are opening
1234 * connections to each other), one of us may just have
1235 * to WAIT. This shouldn't happen if we are the
1236 * client.
1237 */
1238 pr_err("process_connect peer connecting WAIT\n");
1239 /* fall through: a client should never be told to WAIT */
1240 default:
1241 pr_err("connect protocol error, will retry\n");
1242 con->error_msg = "protocol error, garbage tag during connect";
1243 return -1;
1244 }
1245 return 0;
1246}
1247
1248
1249/*
1250 * read (part of) an ack
1251 */
1252static int read_partial_ack(struct ceph_connection *con)
1253{
1254 int to = 0;
1255
1256 return read_partial(con, &to, sizeof(con->in_temp_ack),
1257 &con->in_temp_ack);
1258}
1259
1260
1261/*
1262 * We can finally discard anything that's been acked.
1263 */
1264static void process_ack(struct ceph_connection *con)
1265{
1266 struct ceph_msg *m;
1267 u64 ack = le64_to_cpu(con->in_temp_ack);
1268 u64 seq;
1269
1270 while (!list_empty(&con->out_sent)) {
1271 m = list_first_entry(&con->out_sent, struct ceph_msg,
1272 list_head);
1273 seq = le64_to_cpu(m->hdr.seq);
1274 if (seq > ack)
1275 break;
1276 dout("got ack for seq %llu type %d at %p\n", seq,
1277 le16_to_cpu(m->hdr.type), m);
1278 ceph_msg_remove(m);
1279 }
1280 prepare_read_tag(con);
1281}
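
/*
 * Worked example: if out_sent holds messages with seq 1..5 and the peer
 * acks 3, the loop above drops the references for 1, 2 and 3 and keeps
 * 4 and 5, which ceph_fault() will splice back onto out_queue for
 * retransmission if the connection drops.
 */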
1282
1283
1284
1285
1286static int read_partial_message_section(struct ceph_connection *con,
1287 struct kvec *section, unsigned int sec_len,
1288 u32 *crc)
1289{
1290 int left;
1291 int ret;
1292
1293 BUG_ON(!section);
1294
1295 while (section->iov_len < sec_len) {
1296 BUG_ON(section->iov_base == NULL);
1297 left = sec_len - section->iov_len;
1298 ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
1299 section->iov_len, left);
1300 if (ret <= 0)
1301 return ret;
1302 section->iov_len += ret;
1303 if (section->iov_len == sec_len)
1304 *crc = crc32c(0, section->iov_base,
1305 section->iov_len);
1306 }
1307
1308 return 1;
1309}
1310
1311static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
1312 struct ceph_msg_header *hdr,
1313 int *skip);
1314/*
1315 * read (part of) a message.
1316 */
1317static int read_partial_message(struct ceph_connection *con)
1318{
1319 struct ceph_msg *m = con->in_msg;
1320 void *p;
1321 int ret;
1322 int to, left;
1323 unsigned front_len, middle_len, data_len, data_off;
1324 int datacrc = !con->msgr->nocrc;
1325 int skip;
1326
1327 dout("read_partial_message con %p msg %p\n", con, m);
1328
1329 /* header */
1330 while (con->in_base_pos < sizeof(con->in_hdr)) {
1331 left = sizeof(con->in_hdr) - con->in_base_pos;
1332 ret = ceph_tcp_recvmsg(con->sock,
1333 (char *)&con->in_hdr + con->in_base_pos,
1334 left);
1335 if (ret <= 0)
1336 return ret;
1337 con->in_base_pos += ret;
1338 if (con->in_base_pos == sizeof(con->in_hdr)) {
1339 u32 crc = crc32c(0, (void *)&con->in_hdr,
1340 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
1341 if (crc != le32_to_cpu(con->in_hdr.crc)) {
1342 pr_err("read_partial_message bad hdr"
1343 " crc %u != expected %u\n",
1344 crc, le32_to_cpu(con->in_hdr.crc));
1345 return -EBADMSG;
1346 }
1347 }
1348 }
1349 front_len = le32_to_cpu(con->in_hdr.front_len);
1350 if (front_len > CEPH_MSG_MAX_FRONT_LEN)
1351 return -EIO;
1352 middle_len = le32_to_cpu(con->in_hdr.middle_len);
1353 if (middle_len > CEPH_MSG_MAX_DATA_LEN)
1354 return -EIO;
1355 data_len = le32_to_cpu(con->in_hdr.data_len);
1356 if (data_len > CEPH_MSG_MAX_DATA_LEN)
1357 return -EIO;
1358 data_off = le16_to_cpu(con->in_hdr.data_off);
1359
1360 /* allocate message? */
1361 if (!con->in_msg) {
1362 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
1363 con->in_hdr.front_len, con->in_hdr.data_len);
1364 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1365 if (skip) {
1366 /* skip this message */
1367 dout("alloc_msg returned NULL, skipping message\n");
1368 con->in_base_pos = -front_len - middle_len - data_len -
1369 sizeof(m->footer);
1370 con->in_tag = CEPH_MSGR_TAG_READY;
1371 return 0;
1372 }
1373 if (IS_ERR(con->in_msg)) {
1374 ret = PTR_ERR(con->in_msg);
1375 con->in_msg = NULL;
1376 con->error_msg =
1377 "error allocating memory for incoming message";
1378 return ret;
1379 }
1380 m = con->in_msg;
1381 m->front.iov_len = 0; /* haven't read it yet */
1382 if (m->middle)
1383 m->middle->vec.iov_len = 0;
1384
1385 con->in_msg_pos.page = 0;
1386 con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
1387 con->in_msg_pos.data_pos = 0;
1388 }
1389
1390 /* front */
1391 ret = read_partial_message_section(con, &m->front, front_len,
1392 &con->in_front_crc);
1393 if (ret <= 0)
1394 return ret;
1395
1396 /* middle */
1397 if (m->middle) {
1398 ret = read_partial_message_section(con, &m->middle->vec, middle_len,
1399 &con->in_middle_crc);
1400 if (ret <= 0)
1401 return ret;
1402 }
1403
1404 /* (page) data */
1405 while (con->in_msg_pos.data_pos < data_len) {
1406 left = min((int)(data_len - con->in_msg_pos.data_pos),
1407 (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
1408 BUG_ON(m->pages == NULL);
1409 p = kmap(m->pages[con->in_msg_pos.page]);
1410 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1411 left);
1412 if (ret > 0 && datacrc)
1413 con->in_data_crc =
1414 crc32c(con->in_data_crc,
1415 p + con->in_msg_pos.page_pos, ret);
1416 kunmap(m->pages[con->in_msg_pos.page]);
1417 if (ret <= 0)
1418 return ret;
1419 con->in_msg_pos.data_pos += ret;
1420 con->in_msg_pos.page_pos += ret;
1421 if (con->in_msg_pos.page_pos == PAGE_SIZE) {
1422 con->in_msg_pos.page_pos = 0;
1423 con->in_msg_pos.page++;
1424 }
1425 }
1426
1427 /* footer */
1428 to = sizeof(m->hdr) + sizeof(m->footer);
1429 while (con->in_base_pos < to) {
1430 left = to - con->in_base_pos;
1431 ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
1432 (con->in_base_pos - sizeof(m->hdr)),
1433 left);
1434 if (ret <= 0)
1435 return ret;
1436 con->in_base_pos += ret;
1437 }
1438 dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
1439 m, front_len, m->footer.front_crc, middle_len,
1440 m->footer.middle_crc, data_len, m->footer.data_crc);
1441
1442 /* crc ok? */
1443 if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
1444 pr_err("read_partial_message %p front crc %u != exp. %u\n",
1445 m, con->in_front_crc, le32_to_cpu(m->footer.front_crc));
1446 return -EBADMSG;
1447 }
1448 if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
1449 pr_err("read_partial_message %p middle crc %u != exp %u\n",
1450 m, con->in_middle_crc, le32_to_cpu(m->footer.middle_crc));
1451 return -EBADMSG;
1452 }
1453 if (datacrc &&
1454 (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
1455 con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
1456 pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
1457 con->in_data_crc, le32_to_cpu(m->footer.data_crc));
1458 return -EBADMSG;
1459 }
1460
1461 return 1; /* done! */
1462}
1463
1464/*
1465 * Process message. This happens in the worker thread. The callback should
1466 * be careful not to do anything that waits on other incoming messages or it
1467 * may deadlock.
1468 */
1469static void process_message(struct ceph_connection *con)
1470{
1471 struct ceph_msg *msg;
1472
1473 msg = con->in_msg;
1474 con->in_msg = NULL;
1475
1476 /* if first message, set peer_name */
1477 if (con->peer_name.type == 0)
1478 con->peer_name = msg->hdr.src.name;
1479
1480 con->in_seq++;
1481 mutex_unlock(&con->mutex);
1482
1483 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
1484 msg, le64_to_cpu(msg->hdr.seq),
1485 ENTITY_NAME(msg->hdr.src.name),
1486 le16_to_cpu(msg->hdr.type),
1487 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1488 le32_to_cpu(msg->hdr.front_len),
1489 le32_to_cpu(msg->hdr.data_len),
1490 con->in_front_crc, con->in_middle_crc, con->in_data_crc);
1491 con->ops->dispatch(con, msg);
1492
1493 mutex_lock(&con->mutex);
1494 prepare_read_tag(con);
1495}
1496
1497
1498/*
1499 * Write something to the socket. Called in a worker thread when the
1500 * socket appears to be writeable and we have something ready to send.
1501 */
1502static int try_write(struct ceph_connection *con)
1503{
1504 struct ceph_messenger *msgr = con->msgr;
1505 int ret = 1;
1506
1507 dout("try_write start %p state %lu nref %d\n", con, con->state,
1508 atomic_read(&con->nref));
1509
1510 mutex_lock(&con->mutex);
1511more:
1512 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1513
1514 /* open the socket first? */
1515 if (con->sock == NULL) {
1516 /*
1517 * if we were STANDBY and are reconnecting _this_
1518 * connection, bump connect_seq now. Always bump
1519 * global_seq.
1520 */
1521 if (test_and_clear_bit(STANDBY, &con->state))
1522 con->connect_seq++;
1523
1524 prepare_write_banner(msgr, con);
1525 prepare_write_connect(msgr, con, 1);
1526 prepare_read_banner(con);
1527 set_bit(CONNECTING, &con->state);
1528 clear_bit(NEGOTIATING, &con->state);
1529
1530 BUG_ON(con->in_msg);
1531 con->in_tag = CEPH_MSGR_TAG_READY;
1532 dout("try_write initiating connect on %p new state %lu\n",
1533 con, con->state);
1534 con->sock = ceph_tcp_connect(con);
1535 if (IS_ERR(con->sock)) {
1536 con->sock = NULL;
1537 con->error_msg = "connect error";
1538 ret = -1;
1539 goto out;
1540 }
1541 }
1542
1543more_kvec:
1544 /* kvec data queued? */
1545 if (con->out_skip) {
1546 ret = write_partial_skip(con);
1547 if (ret < 0) {
1548 dout("try_write write_partial_skip err %d\n", ret);
1549 goto done;
1550 }
1551 if (ret == 0)
1552 goto done;
1553 }
1554 if (con->out_kvec_left) {
1555 ret = write_partial_kvec(con);
1556 if (ret <= 0)
1557 goto done;
1558 }
1559
1560 /* msg pages? */
1561 if (con->out_msg) {
1562 if (con->out_msg_done) {
1563 ceph_msg_put(con->out_msg);
1564 con->out_msg = NULL; /* we're done with this one */
1565 goto do_next;
1566 }
1567
1568 ret = write_partial_msg_pages(con);
1569 if (ret == 1)
1570 goto more_kvec; /* we need to send the footer, too! */
1571 if (ret == 0)
1572 goto done;
1573 if (ret < 0) {
1574 dout("try_write write_partial_msg_pages err %d\n",
1575 ret);
1576 goto done;
1577 }
1578 }
1579
1580do_next:
1581 if (!test_bit(CONNECTING, &con->state)) {
1582 /* is anything else pending? */
1583 if (!list_empty(&con->out_queue)) {
1584 prepare_write_message(con);
1585 goto more;
1586 }
1587 if (con->in_seq > con->in_seq_acked) {
1588 prepare_write_ack(con);
1589 goto more;
1590 }
1591 if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
1592 prepare_write_keepalive(con);
1593 goto more;
1594 }
1595 }
1596
1597 /* Nothing to do! */
1598 clear_bit(WRITE_PENDING, &con->state);
1599 dout("try_write nothing else to write.\n");
1600done:
1601 ret = 0;
1602out:
1603 mutex_unlock(&con->mutex);
1604 dout("try_write done on %p\n", con);
1605 return ret;
1606}
1607
1608
1609
1610/*
1611 * Read what we can from the socket.
1612 */
1613static int try_read(struct ceph_connection *con)
1614{
1615 struct ceph_messenger *msgr;
1616 int ret = -1;
1617
1618 if (!con->sock)
1619 return 0;
1620
1621 if (test_bit(STANDBY, &con->state))
1622 return 0;
1623
1624 dout("try_read start on %p\n", con);
1625 msgr = con->msgr;
1626
1627 mutex_lock(&con->mutex);
1628
1629more:
1630 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
1631 con->in_base_pos);
1632 if (test_bit(CONNECTING, &con->state)) {
1633 if (!test_bit(NEGOTIATING, &con->state)) {
1634 dout("try_read connecting\n");
1635 ret = read_partial_banner(con);
1636 if (ret <= 0)
1637 goto done;
1638 if (process_banner(con) < 0) {
1639 ret = -1;
1640 goto out;
1641 }
1642 }
1643 ret = read_partial_connect(con);
1644 if (ret <= 0)
1645 goto done;
1646 if (process_connect(con) < 0) {
1647 ret = -1;
1648 goto out;
1649 }
1650 goto more;
1651 }
1652
1653 if (con->in_base_pos < 0) {
1654 /*
1655 * skipping + discarding content.
1656 *
1657 * FIXME: there must be a better way to do this!
1658 */
1659 static char buf[1024];
1660 int skip = min(1024, -con->in_base_pos);
1661 dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
1662 ret = ceph_tcp_recvmsg(con->sock, buf, skip);
1663 if (ret <= 0)
1664 goto done;
1665 con->in_base_pos += ret;
1666 if (con->in_base_pos)
1667 goto more;
1668 }
1669 if (con->in_tag == CEPH_MSGR_TAG_READY) {
1670 /*
1671 * what's next?
1672 */
1673 ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
1674 if (ret <= 0)
1675 goto done;
1676 dout("try_read got tag %d\n", (int)con->in_tag);
1677 switch (con->in_tag) {
1678 case CEPH_MSGR_TAG_MSG:
1679 prepare_read_message(con);
1680 break;
1681 case CEPH_MSGR_TAG_ACK:
1682 prepare_read_ack(con);
1683 break;
1684 case CEPH_MSGR_TAG_CLOSE:
1685 set_bit(CLOSED, &con->state); /* fixme */
1686 goto done;
1687 default:
1688 goto bad_tag;
1689 }
1690 }
1691 if (con->in_tag == CEPH_MSGR_TAG_MSG) {
1692 ret = read_partial_message(con);
1693 if (ret <= 0) {
1694 switch (ret) {
1695 case -EBADMSG:
1696 con->error_msg = "bad crc";
1697 ret = -EIO;
1698 goto out;
1699 case -EIO:
1700 con->error_msg = "io error";
1701 goto out;
1702 default:
1703 goto done;
1704 }
1705 }
1706 if (con->in_tag == CEPH_MSGR_TAG_READY)
1707 goto more;
1708 process_message(con);
1709 goto more;
1710 }
1711 if (con->in_tag == CEPH_MSGR_TAG_ACK) {
1712 ret = read_partial_ack(con);
1713 if (ret <= 0)
1714 goto done;
1715 process_ack(con);
1716 goto more;
1717 }
1718
1719done:
1720 ret = 0;
1721out:
1722 mutex_unlock(&con->mutex);
1723 dout("try_read done on %p\n", con);
1724 return ret;
1725
1726bad_tag:
1727 pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
1728 con->error_msg = "protocol error, garbage tag";
1729 ret = -1;
1730 goto out;
1731}
1732
1733
1734/*
1735 * Atomically queue work on a connection. Bump @con reference to
1736 * avoid races with connection teardown.
1737 *
1738 * There is some trickery going on with QUEUED and BUSY because we
1739 * only want a _single_ thread operating on each connection at any
1740 * point in time, but we want to use all available CPUs.
1741 *
1742 * The worker thread only proceeds if it can atomically set BUSY. It
1743 * clears QUEUED and does its thing. When it thinks it's done, it
1744 * clears BUSY, then rechecks QUEUED; if it's set again, it loops
1745 * (tries again to set BUSY).
1746 *
1747 * To queue work, we first set QUEUED, _then_, if BUSY isn't set,
1748 * try to queue the work. If that fails (the work is already queued
1749 * or already being done), we give up but leave QUEUED set so that
1750 * the worker thread will loop if necessary.
1751 */
1752static void queue_con(struct ceph_connection *con)
1753{
1754 if (test_bit(DEAD, &con->state)) {
1755 dout("queue_con %p ignoring: DEAD\n",
1756 con);
1757 return;
1758 }
1759
1760 if (!con->ops->get(con)) {
1761 dout("queue_con %p ref count 0\n", con);
1762 return;
1763 }
1764
1765 set_bit(QUEUED, &con->state);
1766 if (test_bit(BUSY, &con->state)) {
1767 dout("queue_con %p - already BUSY\n", con);
1768 con->ops->put(con);
1769 } else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
1770 dout("queue_con %p - already queued\n", con);
1771 con->ops->put(con);
1772 } else {
1773 dout("queue_con %p\n", con);
1774 }
1775}
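
/*
 * Example of the race the QUEUED/BUSY pair resolves: CPU A is inside
 * con_work() with BUSY set when CPU B calls queue_con(). B's
 * queue_work() may fail (the work is still running), but QUEUED is now
 * set, so when A clears BUSY and rechecks QUEUED it loops instead of
 * losing B's wakeup.
 */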
1776
1777/*
1778 * Do some work on a connection. Drop a connection ref when we're done.
1779 */
1780static void con_work(struct work_struct *work)
1781{
1782 struct ceph_connection *con = container_of(work, struct ceph_connection,
1783 work.work);
1784 int backoff = 0;
1785
1786more:
1787 if (test_and_set_bit(BUSY, &con->state) != 0) {
1788 dout("con_work %p BUSY already set\n", con);
1789 goto out;
1790 }
1791 dout("con_work %p start, clearing QUEUED\n", con);
1792 clear_bit(QUEUED, &con->state);
1793
1794 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
1795 dout("con_work CLOSED\n");
1796 con_close_socket(con);
1797 goto done;
1798 }
1799 if (test_and_clear_bit(OPENING, &con->state)) {
1800 /* reopen w/ new peer */
1801 dout("con_work OPENING\n");
1802 con_close_socket(con);
1803 }
1804
1805 if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
1806 try_read(con) < 0 ||
1807 try_write(con) < 0) {
1808 backoff = 1;
1809 ceph_fault(con); /* error/fault path */
1810 }
1811
1812done:
1813 clear_bit(BUSY, &con->state);
1814 dout("con->state=%lu\n", con->state);
1815 if (test_bit(QUEUED, &con->state)) {
1816 if (!backoff || test_bit(OPENING, &con->state)) {
1817 dout("con_work %p QUEUED reset, looping\n", con);
1818 goto more;
1819 }
1820 dout("con_work %p QUEUED reset, but just faulted\n", con);
1821 clear_bit(QUEUED, &con->state);
1822 }
1823 dout("con_work %p done\n", con);
1824
1825out:
1826 con->ops->put(con);
1827}
1828
1829
1830/*
1831 * Generic error/fault handler. A retry mechanism is used with
1832 * exponential backoff.
1833 */
1834static void ceph_fault(struct ceph_connection *con)
1835{
1836 pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
1837 pr_addr(&con->peer_addr.in_addr), con->error_msg);
1838 dout("fault %p state %lu to peer %s\n",
1839 con, con->state, pr_addr(&con->peer_addr.in_addr));
1840
1841 if (test_bit(LOSSYTX, &con->state)) {
1842 dout("fault on LOSSYTX channel\n");
1843 goto out;
1844 }
1845
1846 clear_bit(BUSY, &con->state); /* to avoid an improbable race */
1847
1848 mutex_lock(&con->mutex);
1849 if (test_bit(CLOSED, &con->state))
1850 goto out_unlock;
1851
1852 con_close_socket(con);
1853
1854 if (con->in_msg) {
1855 ceph_msg_put(con->in_msg);
1856 con->in_msg = NULL;
1857 }
1858
1859 /* Requeue anything that hasn't been acked */
1860 list_splice_init(&con->out_sent, &con->out_queue);
1861
1862 /* If there are no messages in the queue, place the connection
1863 * in a STANDBY state (i.e., don't try to reconnect just yet). */
1864 if (list_empty(&con->out_queue) && !test_bit(KEEPALIVE_PENDING, &con->state)) {
1865 dout("fault setting STANDBY\n");
1866 set_bit(STANDBY, &con->state);
1867 } else {
1868 /* retry after a delay. */
1869 if (con->delay == 0)
1870 con->delay = BASE_DELAY_INTERVAL;
1871 else if (con->delay < MAX_DELAY_INTERVAL)
1872 con->delay *= 2;
1873 dout("fault queueing %p delay %lu\n", con, con->delay);
1874 con->ops->get(con);
1875 if (queue_delayed_work(ceph_msgr_wq, &con->work,
1876 round_jiffies_relative(con->delay)) == 0)
1877 con->ops->put(con);
1878 }
1879
1880out_unlock:
1881 mutex_unlock(&con->mutex);
1882out:
1883 /*
1884 * in case we faulted due to authentication, invalidate our
1885 * current tickets so that we can get new ones.
1886 */
1887 if (con->auth_retry && con->ops->invalidate_authorizer) {
1888 dout("calling invalidate_authorizer()\n");
1889 con->ops->invalidate_authorizer(con);
1890 }
1891
1892 if (con->ops->fault)
1893 con->ops->fault(con);
1894}
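
/*
 * Backoff example: starting from BASE_DELAY_INTERVAL, con->delay
 * doubles on every consecutive fault while it is still below
 * MAX_DELAY_INTERVAL (so it never exceeds twice that cap), and a later
 * ceph_con_open() resets it to 0.
 */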
1895
1896
1897
1898/*
1899 * create a new messenger instance
1900 */
1901struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1902{
1903 struct ceph_messenger *msgr;
1904
1905 msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
1906 if (msgr == NULL)
1907 return ERR_PTR(-ENOMEM);
1908
1909 spin_lock_init(&msgr->global_seq_lock);
1910
1911 /* the zero page is needed if a request is "canceled" while the message
1912 * is being written over the socket */
1913 msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
1914 if (!msgr->zero_page) {
1915 kfree(msgr);
1916 return ERR_PTR(-ENOMEM);
1917 }
1918 kmap(msgr->zero_page);
1919
1920 if (myaddr)
1921 msgr->inst.addr = *myaddr;
1922
1923 /* select a random nonce */
1924 msgr->inst.addr.type = 0;
1925 get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
1926 encode_my_addr(msgr);
1927
1928 dout("messenger_create %p\n", msgr);
1929 return msgr;
1930}
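
/*
 * Setup sketch: a client typically passes a NULL @myaddr and lets
 * process_banner() learn its address from the first peer. The ops
 * table and peer address below are assumptions for illustration:
 */
#if 0
	struct ceph_messenger *msgr = ceph_messenger_create(NULL);
	struct ceph_connection *con;

	if (IS_ERR(msgr))
		return PTR_ERR(msgr);
	con = kzalloc(sizeof(*con), GFP_KERNEL);
	if (!con)
		return -ENOMEM;
	ceph_con_init(msgr, con);
	con->ops = &my_con_ops;			/* hypothetical ops table */
	ceph_con_open(con, &peer_addr);		/* hypothetical peer */
#endif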
1931
1932void ceph_messenger_destroy(struct ceph_messenger *msgr)
1933{
1934 dout("destroy %p\n", msgr);
1935 kunmap(msgr->zero_page);
1936 __free_page(msgr->zero_page);
1937 kfree(msgr);
1938 dout("destroyed messenger %p\n", msgr);
1939}
1940
1941/*
1942 * Queue up an outgoing message on the given connection.
1943 */
1944void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
1945{
1946 if (test_bit(CLOSED, &con->state)) {
1947 dout("con_send %p closed, dropping %p\n", con, msg);
1948 ceph_msg_put(msg);
1949 return;
1950 }
1951
1952 /* set src+dst */
1953 msg->hdr.src.name = con->msgr->inst.name;
1954 msg->hdr.src.addr = con->msgr->my_enc_addr;
1955 msg->hdr.orig_src = msg->hdr.src;
1956
1957 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
1958
1959 /* queue */
1960 mutex_lock(&con->mutex);
1961 BUG_ON(!list_empty(&msg->list_head));
1962 list_add_tail(&msg->list_head, &con->out_queue);
1963 dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
1964 ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
1965 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1966 le32_to_cpu(msg->hdr.front_len),
1967 le32_to_cpu(msg->hdr.middle_len),
1968 le32_to_cpu(msg->hdr.data_len));
1969 mutex_unlock(&con->mutex);
1970
1971 /* if there wasn't anything waiting to send before, queue
1972 * new work */
1973 if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
1974 queue_con(con);
1975}
1976
1977/*
1978 * Revoke a message that was previously queued for send
1979 */
1980void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
1981{
1982 mutex_lock(&con->mutex);
1983 if (!list_empty(&msg->list_head)) {
1984 dout("con_revoke %p msg %p\n", con, msg);
1985 list_del_init(&msg->list_head);
1986 ceph_msg_put(msg);
1987 msg->hdr.seq = 0;
1988 if (con->out_msg == msg) {
1989 ceph_msg_put(con->out_msg);
1990 con->out_msg = NULL;
1991 }
1992 if (con->out_kvec_is_msg) {
1993 con->out_skip = con->out_kvec_bytes;
1994 con->out_kvec_is_msg = false;
1995 }
1996 } else {
1997 dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg);
1998 }
1999 mutex_unlock(&con->mutex);
2000}
2001
2002/*
2003 * Revoke a message that we may be reading data into
2004 */
2005void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
2006{
2007 mutex_lock(&con->mutex);
2008 if (con->in_msg && con->in_msg == msg) {
2009 unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
2010 unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
2011 unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
2012
2013 /* skip rest of message */
2014 dout("con_revoke_pages %p msg %p revoked\n", con, msg);
2015 con->in_base_pos = con->in_base_pos -
2016 sizeof(struct ceph_msg_header) -
2017 front_len -
2018 middle_len -
2019 data_len -
2020 sizeof(struct ceph_msg_footer);
2021 ceph_msg_put(con->in_msg);
2022 con->in_msg = NULL;
2023 con->in_tag = CEPH_MSGR_TAG_READY;
2024 } else {
2025 dout("con_revoke_pages %p msg %p pages %p no-op\n",
2026 con, con->in_msg, msg);
2027 }
2028 mutex_unlock(&con->mutex);
2029}
2030
2031/*
2032 * Queue a keepalive byte to ensure the tcp connection is alive.
2033 */
2034void ceph_con_keepalive(struct ceph_connection *con)
2035{
2036 if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
2037 test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2038 queue_con(con);
2039}
2040
2041
2042/*
2043 * construct a new message with given type, size
2044 * the new msg has a ref count of 1.
2045 */
2046struct ceph_msg *ceph_msg_new(int type, int front_len,
2047 int page_len, int page_off, struct page **pages)
2048{
2049 struct ceph_msg *m;
2050
2051 m = kmalloc(sizeof(*m), GFP_NOFS);
2052 if (m == NULL)
2053 goto out;
2054 kref_init(&m->kref);
2055 INIT_LIST_HEAD(&m->list_head);
2056
2057 m->hdr.type = cpu_to_le16(type);
2058 m->hdr.front_len = cpu_to_le32(front_len);
2059 m->hdr.middle_len = 0;
2060 m->hdr.data_len = cpu_to_le32(page_len);
2061 m->hdr.data_off = cpu_to_le16(page_off);
2062 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
2063 m->footer.front_crc = 0;
2064 m->footer.middle_crc = 0;
2065 m->footer.data_crc = 0;
2066 m->front_max = front_len;
2067 m->front_is_vmalloc = false;
2068 m->more_to_follow = false;
2069 m->pool = NULL;
 /* init these two before the front alloc can fail: the out2 error
 * path drops our only ref, and ceph_msg_last_put() reads them */
 m->middle = NULL;
 m->pagelist = NULL;
2070
2071 /* front */
2072 if (front_len) {
2073 if (front_len > PAGE_CACHE_SIZE) {
2074 m->front.iov_base = __vmalloc(front_len, GFP_NOFS,
2075 PAGE_KERNEL);
2076 m->front_is_vmalloc = true;
2077 } else {
2078 m->front.iov_base = kmalloc(front_len, GFP_NOFS);
2079 }
2080 if (m->front.iov_base == NULL) {
2081 pr_err("msg_new can't allocate %d bytes\n",
2082 front_len);
2083 goto out2;
2084 }
2085 } else {
2086 m->front.iov_base = NULL;
2087 }
2088 m->front.iov_len = front_len;
2089
2090 /* middle */
2091 m->middle = NULL;
2092
2093 /* data */
2094 m->nr_pages = calc_pages_for(page_off, page_len);
2095 m->pages = pages;
2096 m->pagelist = NULL;
2097
2098 dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len,
2099 m->nr_pages);
2100 return m;
2101
2102out2:
2103 ceph_msg_put(m);
2104out:
2105 pr_err("msg_new can't create type %d len %d\n", type, front_len);
2106 return ERR_PTR(-ENOMEM);
2107}
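
/*
 * Usage sketch: allocate a message with a 128-byte front and no data
 * pages, then hand it to a connection. @type stands in for whatever
 * CEPH_MSG_* value the caller is sending, and @payload is assumed:
 */
#if 0
	struct ceph_msg *m = ceph_msg_new(type, 128, 0, 0, NULL);

	if (IS_ERR(m))
		return PTR_ERR(m);
	memcpy(m->front.iov_base, payload, 128);
	ceph_con_send(con, m);	/* the out_queue takes over our reference */
#endif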
2108
2109/*
2110 * Allocate "middle" portion of a message, if it is needed and wasn't
2111 * allocated by alloc_msg. This allows us to read a small fixed-size
2112 * per-type header in the front and then gracefully fail (i.e.,
2113 * propagate the error to the caller based on info in the front) when
2114 * the middle is too large.
2115 */
2116static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
2117{
2118 int type = le16_to_cpu(msg->hdr.type);
2119 int middle_len = le32_to_cpu(msg->hdr.middle_len);
2120
2121 dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
2122 ceph_msg_type_name(type), middle_len);
2123 BUG_ON(!middle_len);
2124 BUG_ON(msg->middle);
2125
2126 msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
2127 if (!msg->middle)
2128 return -ENOMEM;
2129 return 0;
2130}
2131
2132/*
2133 * Generic message allocator, for incoming messages.
2134 */
2135static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2136 struct ceph_msg_header *hdr,
2137 int *skip)
2138{
2139 int type = le16_to_cpu(hdr->type);
2140 int front_len = le32_to_cpu(hdr->front_len);
2141 int middle_len = le32_to_cpu(hdr->middle_len);
2142 struct ceph_msg *msg = NULL;
2143 int ret;
2144
2145 if (con->ops->alloc_msg) {
2146 mutex_unlock(&con->mutex);
2147 msg = con->ops->alloc_msg(con, hdr, skip);
2148 mutex_lock(&con->mutex);
2149 if (IS_ERR(msg))
2150 return msg;
2151
2152 if (*skip)
2153 return NULL;
2154 }
2155 if (!msg) {
2156 *skip = 0;
2157 msg = ceph_msg_new(type, front_len, 0, 0, NULL);
2158 if (IS_ERR(msg)) { /* ceph_msg_new returns ERR_PTR, never NULL */
2159 pr_err("unable to allocate msg type %d len %d\n",
2160 type, front_len);
2161 return ERR_PTR(-ENOMEM);
2162 }
2163 }
2164 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2165
2166 if (middle_len) {
2167 ret = ceph_alloc_middle(con, msg);
2168
2169 if (ret < 0) {
2170 ceph_msg_put(msg);
2171 return ERR_PTR(ret); /* msg was just freed above */
2172 }
2173 }
2174
2175 return msg;
2176}
2177
2178
2179/*
2180 * Free a generically kmalloc'd message.
2181 */
2182void ceph_msg_kfree(struct ceph_msg *m)
2183{
2184 dout("msg_kfree %p\n", m);
2185 if (m->front_is_vmalloc)
2186 vfree(m->front.iov_base);
2187 else
2188 kfree(m->front.iov_base);
2189 kfree(m);
2190}
2191
2192/*
2193 * Drop a msg ref. Destroy as needed.
2194 */
2195void ceph_msg_last_put(struct kref *kref)
2196{
2197 struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
2198
2199 dout("ceph_msg_put last one on %p\n", m);
2200 WARN_ON(!list_empty(&m->list_head));
2201
2202 /* drop middle, data, if any */
2203 if (m->middle) {
2204 ceph_buffer_put(m->middle);
2205 m->middle = NULL;
2206 }
2207 m->nr_pages = 0;
2208 m->pages = NULL;
2209
2210 if (m->pagelist) {
2211 ceph_pagelist_release(m->pagelist);
2212 kfree(m->pagelist);
2213 m->pagelist = NULL;
2214 }
2215
2216 if (m->pool)
2217 ceph_msgpool_put(m->pool, m);
2218 else
2219 ceph_msg_kfree(m);
2220}
2221
2222void ceph_msg_dump(struct ceph_msg *msg)
2223{
2224 pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
2225 msg->front_max, msg->nr_pages);
2226 print_hex_dump(KERN_DEBUG, "header: ",
2227 DUMP_PREFIX_OFFSET, 16, 1,
2228 &msg->hdr, sizeof(msg->hdr), true);
2229 print_hex_dump(KERN_DEBUG, " front: ",
2230 DUMP_PREFIX_OFFSET, 16, 1,
2231 msg->front.iov_base, msg->front.iov_len, true);
2232 if (msg->middle)
2233 print_hex_dump(KERN_DEBUG, "middle: ",
2234 DUMP_PREFIX_OFFSET, 16, 1,
2235 msg->middle->vec.iov_base,
2236 msg->middle->vec.iov_len, true);
2237 print_hex_dump(KERN_DEBUG, "footer: ",
2238 DUMP_PREFIX_OFFSET, 16, 1,
2239 &msg->footer, sizeof(msg->footer), true);
2240}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
new file mode 100644
index 000000000000..4caaa5911110
--- /dev/null
+++ b/fs/ceph/messenger.h
@@ -0,0 +1,254 @@
1#ifndef __FS_CEPH_MESSENGER_H
2#define __FS_CEPH_MESSENGER_H
3
4#include <linux/kref.h>
5#include <linux/mutex.h>
6#include <linux/net.h>
7#include <linux/radix-tree.h>
8#include <linux/uio.h>
9#include <linux/version.h>
10#include <linux/workqueue.h>
11
12#include "types.h"
13#include "buffer.h"
14
15struct ceph_msg;
16struct ceph_connection;
17
18extern struct workqueue_struct *ceph_msgr_wq; /* receive work queue */
19
20/*
21 * Ceph defines these callbacks for handling connection events.
22 */
23struct ceph_connection_operations {
24 struct ceph_connection *(*get)(struct ceph_connection *);
25 void (*put)(struct ceph_connection *);
26
27 /* handle an incoming message. */
28 void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
29
30 /* authorize an outgoing connection */
31 int (*get_authorizer) (struct ceph_connection *con,
32 void **buf, int *len, int *proto,
33 void **reply_buf, int *reply_len, int force_new);
34 int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
35 int (*invalidate_authorizer)(struct ceph_connection *con);
36
37 /* protocol version mismatch */
38 void (*bad_proto) (struct ceph_connection *con);
39
40 /* there was some error on the socket (disconnect, whatever) */
41 void (*fault) (struct ceph_connection *con);
42
43 /* a remote host has terminated a message exchange session, and messages
44 * we sent (or they tried to send us) may be lost. */
45 void (*peer_reset) (struct ceph_connection *con);
46
47 struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
48 struct ceph_msg_header *hdr,
49 int *skip);
50};
51
52extern const char *ceph_name_type_str(int t);
53
54/* use format string %s%d */
55#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num)
56
57struct ceph_messenger {
58 struct ceph_entity_inst inst; /* my name+address */
59 struct ceph_entity_addr my_enc_addr;
60 struct page *zero_page; /* used in certain error cases */
61
62 bool nocrc;
63
64 /*
65 * the global_seq counts connections I (attempt to) initiate
66 * in order to disambiguate certain connect race conditions.
67 */
68 u32 global_seq;
69 spinlock_t global_seq_lock;
70};
71
72/*
73 * a single message. it contains a header (src, dest, message type, etc.),
74 * footer (crc values, mainly), a "front" message body, and possibly a
75 * data payload (stored in some number of pages).
76 */
77struct ceph_msg {
78 struct ceph_msg_header hdr; /* header */
79 struct ceph_msg_footer footer; /* footer */
80 struct kvec front; /* unaligned blobs of message */
81 struct ceph_buffer *middle;
82 struct page **pages; /* data payload. NOT OWNER. */
83 unsigned nr_pages; /* size of page array */
84 struct ceph_pagelist *pagelist; /* instead of pages */
85 struct list_head list_head;
86 struct kref kref;
87 bool front_is_vmalloc;
88 bool more_to_follow;
89 int front_max;
90
91 struct ceph_msgpool *pool;
92};
93
94struct ceph_msg_pos {
95 int page, page_pos; /* which page; offset in page */
96 int data_pos; /* offset in data payload */
97 int did_page_crc; /* true if we've calculated crc for current page */
98};
99
100/* ceph connection fault delay defaults, for exponential backoff */
101#define BASE_DELAY_INTERVAL (HZ/2)
102#define MAX_DELAY_INTERVAL (5 * 60 * HZ)
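/*
 * Illustrative sketch only (assumption: this mirrors how the messenger's
 * fault path advances con->delay between reconnect attempts; the helper
 * name is hypothetical and not part of this header):
 */
static inline unsigned long ceph_next_fault_delay(unsigned long delay)
{
	if (delay == 0)
		return BASE_DELAY_INTERVAL;	/* first fault: HZ/2 */
	delay *= 2;				/* exponential backoff */
	return delay > MAX_DELAY_INTERVAL ? MAX_DELAY_INTERVAL : delay;
}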
103
104/*
105 * ceph_connection state bit flags
106 *
107 * QUEUED and BUSY are used together to ensure that only a single
108 * thread is currently opening, reading or writing data to the socket.
109 */
110#define LOSSYTX 0 /* we can close channel or drop messages on errors */
111#define CONNECTING 1
112#define NEGOTIATING 2
113#define KEEPALIVE_PENDING 3
114#define WRITE_PENDING 4 /* we have data ready to send */
115#define QUEUED 5 /* there is work queued on this connection */
116#define BUSY 6 /* work is being done */
117#define STANDBY 8 /* no outgoing messages, socket closed. we keep
118 * the ceph_connection around to maintain shared
119 * state with the peer. */
120#define CLOSED 10 /* we've closed the connection */
121#define SOCK_CLOSED 11 /* socket state changed to closed */
122#define OPENING 13 /* open connection w/ (possibly new) peer */
123#define DEAD 14 /* dead, about to kfree */
124
125/*
126 * A single connection with another host.
127 *
128 * We maintain a queue of outgoing messages, and some session state to
129 * ensure that we can preserve the lossless, ordered delivery of
130 * messages in the case of a TCP disconnect.
131 */
132struct ceph_connection {
133 void *private;
134 atomic_t nref;
135
136 const struct ceph_connection_operations *ops;
137
138 struct ceph_messenger *msgr;
139 struct socket *sock;
140 unsigned long state; /* connection state (see flags above) */
141 const char *error_msg; /* error message, if any */
142
143 struct ceph_entity_addr peer_addr; /* peer address */
144 struct ceph_entity_name peer_name; /* peer name */
145 struct ceph_entity_addr peer_addr_for_me;
146 u32 connect_seq; /* identify the most recent connection
147 attempt for this connection, client */
148 u32 peer_global_seq; /* peer's global seq for this connection */
149
150 int auth_retry; /* true if we need a newer authorizer */
151 void *auth_reply_buf; /* where to put the authorizer reply */
152 int auth_reply_buf_len;
153
154 struct mutex mutex;
155
156 /* out queue */
157 struct list_head out_queue;
158 struct list_head out_sent; /* sending or sent but unacked */
159 u64 out_seq; /* last message queued for send */
160 u64 out_seq_sent; /* last message sent */
161 bool out_keepalive_pending;
162
163 u64 in_seq, in_seq_acked; /* last message received, acked */
164
165 /* connection negotiation temps */
166 char in_banner[CEPH_BANNER_MAX_LEN];
167 union {
168 struct { /* outgoing connection */
169 struct ceph_msg_connect out_connect;
170 struct ceph_msg_connect_reply in_reply;
171 };
172 struct { /* incoming */
173 struct ceph_msg_connect in_connect;
174 struct ceph_msg_connect_reply out_reply;
175 };
176 };
177 struct ceph_entity_addr actual_peer_addr;
178
179 /* message out temps */
180 struct ceph_msg *out_msg; /* sending message (== tail of
181 out_sent) */
182 bool out_msg_done;
183 struct ceph_msg_pos out_msg_pos;
184
185 struct kvec out_kvec[8], /* sending header/footer data */
186 *out_kvec_cur;
187 int out_kvec_left; /* kvec's left in out_kvec */
188 int out_skip; /* skip this many bytes */
189 int out_kvec_bytes; /* total bytes left */
190 bool out_kvec_is_msg; /* kvec refers to out_msg */
191 int out_more; /* there is more data after the kvecs */
192 __le64 out_temp_ack; /* for writing an ack */
193
194 /* message in temps */
195 struct ceph_msg_header in_hdr;
196 struct ceph_msg *in_msg;
197 struct ceph_msg_pos in_msg_pos;
198 u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
199
200 char in_tag; /* protocol control byte */
201 int in_base_pos; /* bytes read */
202 __le64 in_temp_ack; /* for reading an ack */
203
204 struct delayed_work work; /* send|recv work */
205 unsigned long delay; /* current delay interval */
206};
207
208
209extern const char *pr_addr(const struct sockaddr_storage *ss);
210extern int ceph_parse_ips(const char *c, const char *end,
211 struct ceph_entity_addr *addr,
212 int max_count, int *count);
213
214
215extern int ceph_msgr_init(void);
216extern void ceph_msgr_exit(void);
217
218extern struct ceph_messenger *ceph_messenger_create(
219 struct ceph_entity_addr *myaddr);
220extern void ceph_messenger_destroy(struct ceph_messenger *);
221
222extern void ceph_con_init(struct ceph_messenger *msgr,
223 struct ceph_connection *con);
224extern void ceph_con_open(struct ceph_connection *con,
225 struct ceph_entity_addr *addr);
226extern void ceph_con_close(struct ceph_connection *con);
227extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
228extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
229extern void ceph_con_revoke_message(struct ceph_connection *con,
230 struct ceph_msg *msg);
231extern void ceph_con_keepalive(struct ceph_connection *con);
232extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
233extern void ceph_con_put(struct ceph_connection *con);
234
235extern struct ceph_msg *ceph_msg_new(int type, int front_len,
236 int page_len, int page_off,
237 struct page **pages);
238extern void ceph_msg_kfree(struct ceph_msg *m);
239
240
241static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
242{
243 kref_get(&msg->kref);
244 return msg;
245}
246extern void ceph_msg_last_put(struct kref *kref);
247static inline void ceph_msg_put(struct ceph_msg *msg)
248{
249 kref_put(&msg->kref, ceph_msg_last_put);
250}
251
252extern void ceph_msg_dump(struct ceph_msg *msg);
253
254#endif
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
new file mode 100644
index 000000000000..890597c09d43
--- /dev/null
+++ b/fs/ceph/mon_client.c
@@ -0,0 +1,834 @@
1#include "ceph_debug.h"
2
3#include <linux/types.h>
4#include <linux/random.h>
5#include <linux/sched.h>
6
7#include "mon_client.h"
8#include "super.h"
9#include "auth.h"
10#include "decode.h"
11
12/*
13 * Interact with Ceph monitor cluster. Handle requests for new map
14 * versions, and periodically resend as needed. Also implement
15 * statfs() and umount().
16 *
17 * A small cluster of Ceph "monitors" is responsible for managing critical
18 * cluster configuration and state information. An odd number (e.g., 3, 5)
19 * of cmon daemons use a modified version of the Paxos part-time parliament
20 * algorithm to manage the MDS map (mds cluster membership), OSD map, and
21 * list of clients who have mounted the file system.
22 *
23 * We maintain an open, active session with a monitor at all times in order to
24 * receive timely MDSMap updates. We periodically send a keepalive byte on the
25 * TCP socket to ensure we detect a failure. If the connection does break, we
26 * randomly hunt for a new monitor. Once the connection is reestablished, we
27 * resend any outstanding requests.
28 */
29
30static const struct ceph_connection_operations mon_con_ops;
31
32static int __validate_auth(struct ceph_mon_client *monc);
33
34/*
35 * Decode a monmap blob (e.g., during mount).
36 */
37struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
38{
39 struct ceph_monmap *m = NULL;
40 int i, err = -EINVAL;
41 struct ceph_fsid fsid;
42 u32 epoch, num_mon;
43 u16 version;
44 u32 len;
45
46 ceph_decode_32_safe(&p, end, len, bad);
47 ceph_decode_need(&p, end, len, bad);
48
49 dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
50
51 ceph_decode_16_safe(&p, end, version, bad);
52
53 ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
54 ceph_decode_copy(&p, &fsid, sizeof(fsid));
55 epoch = ceph_decode_32(&p);
56
57 num_mon = ceph_decode_32(&p);
58 ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
59
60 if (num_mon >= CEPH_MAX_MON)
61 goto bad;
62 m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
63 if (m == NULL)
64 return ERR_PTR(-ENOMEM);
65 m->fsid = fsid;
66 m->epoch = epoch;
67 m->num_mon = num_mon;
68 ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
69 for (i = 0; i < num_mon; i++)
70 ceph_decode_addr(&m->mon_inst[i].addr);
71
72 dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
73 m->num_mon);
74 for (i = 0; i < m->num_mon; i++)
75 dout("monmap_decode mon%d is %s\n", i,
76 pr_addr(&m->mon_inst[i].addr.in_addr));
77 return m;
78
79bad:
80 dout("monmap_decode failed with %d\n", err);
81 kfree(m);
82 return ERR_PTR(err);
83}
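/*
 * Note on the decode helpers used above (sketch of the contract; the
 * macros themselves live in decode.h, not shown here): the *_safe
 * variants and ceph_decode_need() verify that enough bytes remain
 * between p and end before reading, and jump to the supplied label
 * ("bad") otherwise; the bare ceph_decode_*() calls are only legal
 * after such a bounds check has been made.
 */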
84
85/*
86 * return true if *addr is included in the monmap.
87 */
88int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
89{
90 int i;
91
92 for (i = 0; i < m->num_mon; i++)
93 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
94 return 1;
95 return 0;
96}
97
98/*
99 * Send an auth request.
100 */
101static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
102{
103 monc->pending_auth = 1;
104 monc->m_auth->front.iov_len = len;
105 monc->m_auth->hdr.front_len = cpu_to_le32(len);
106 ceph_msg_get(monc->m_auth); /* keep our ref */
107 ceph_con_send(monc->con, monc->m_auth);
108}
109
110/*
111 * Close monitor session, if any.
112 */
113static void __close_session(struct ceph_mon_client *monc)
114{
115 if (monc->con) {
116 dout("__close_session closing mon%d\n", monc->cur_mon);
117 ceph_con_revoke(monc->con, monc->m_auth);
118 ceph_con_close(monc->con);
119 monc->cur_mon = -1;
120 monc->pending_auth = 0;
121 ceph_auth_reset(monc->auth);
122 }
123}
124
125/*
126 * Open a session with a (new) monitor.
127 */
128static int __open_session(struct ceph_mon_client *monc)
129{
130 unsigned char r; /* keep the modulo below non-negative */
131 int ret;
132
133 if (monc->cur_mon < 0) {
134 get_random_bytes(&r, 1);
135 monc->cur_mon = r % monc->monmap->num_mon;
136 dout("open_session num=%d r=%d -> mon%d\n",
137 monc->monmap->num_mon, r, monc->cur_mon);
138 monc->sub_sent = 0;
139 monc->sub_renew_after = jiffies; /* i.e., expired */
140 monc->want_next_osdmap = !!monc->want_next_osdmap;
141
142 dout("open_session mon%d opening\n", monc->cur_mon);
143 monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
144 monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
145 ceph_con_open(monc->con,
146 &monc->monmap->mon_inst[monc->cur_mon].addr);
147
148 /* initiate authentication handshake */
149 ret = ceph_auth_build_hello(monc->auth,
150 monc->m_auth->front.iov_base,
151 monc->m_auth->front_max);
152 __send_prepared_auth_request(monc, ret);
153 } else {
154 dout("open_session mon%d already open\n", monc->cur_mon);
155 }
156 return 0;
157}
158
159static bool __sub_expired(struct ceph_mon_client *monc)
160{
161 return time_after_eq(jiffies, monc->sub_renew_after);
162}
163
164/*
165 * Reschedule delayed work timer.
166 */
167static void __schedule_delayed(struct ceph_mon_client *monc)
168{
169 unsigned delay;
170
171 if (monc->cur_mon < 0 || __sub_expired(monc))
172 delay = 10 * HZ;
173 else
174 delay = 20 * HZ;
175 dout("__schedule_delayed after %u\n", delay);
176 schedule_delayed_work(&monc->delayed_work, delay);
177}
178
179/*
180 * Send subscribe request for mdsmap and/or osdmap.
181 */
182static void __send_subscribe(struct ceph_mon_client *monc)
183{
184 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
185 (unsigned)monc->sub_sent, __sub_expired(monc),
186 monc->want_next_osdmap);
187 if ((__sub_expired(monc) && !monc->sub_sent) ||
188 monc->want_next_osdmap == 1) {
189 struct ceph_msg *msg;
190 struct ceph_mon_subscribe_item *i;
191 void *p, *end;
192
193 msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
194 if (IS_ERR(msg))
195 return;
196
197 p = msg->front.iov_base;
198 end = p + msg->front.iov_len;
199
200 dout("__send_subscribe to 'mdsmap' %u+\n",
201 (unsigned)monc->have_mdsmap);
202 if (monc->want_next_osdmap) {
203 dout("__send_subscribe to 'osdmap' %u\n",
204 (unsigned)monc->have_osdmap);
205 ceph_encode_32(&p, 3);
206 ceph_encode_string(&p, end, "osdmap", 6);
207 i = p;
208 i->have = cpu_to_le64(monc->have_osdmap);
209 i->onetime = 1;
210 p += sizeof(*i);
211 monc->want_next_osdmap = 2; /* requested */
212 } else {
213 ceph_encode_32(&p, 2);
214 }
215 ceph_encode_string(&p, end, "mdsmap", 6);
216 i = p;
217 i->have = cpu_to_le64(monc->have_mdsmap);
218 i->onetime = 0;
219 p += sizeof(*i);
220 ceph_encode_string(&p, end, "monmap", 6);
221 i = p;
222 i->have = 0;
223 i->onetime = 0;
224 p += sizeof(*i);
225
226 msg->front.iov_len = p - msg->front.iov_base;
227 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
228 ceph_con_send(monc->con, msg);
229
230 monc->sub_sent = jiffies | 1; /* never 0 */
231 }
232}
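/*
 * The encoded front built above is, in order (sketch):
 *
 *	__le32 num_entries;		(2, or 3 when an osdmap is wanted)
 *	then per entry:
 *	  string name;			("osdmap", "mdsmap", "monmap")
 *	  struct ceph_mon_subscribe_item { have, onetime };
 */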
233
234static void handle_subscribe_ack(struct ceph_mon_client *monc,
235 struct ceph_msg *msg)
236{
237 unsigned seconds;
238 struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
239
240 if (msg->front.iov_len < sizeof(*h))
241 goto bad;
242 seconds = le32_to_cpu(h->duration);
243
244 mutex_lock(&monc->mutex);
245 if (monc->hunting) {
246 pr_info("mon%d %s session established\n",
247 monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
248 monc->hunting = false;
249 }
250 dout("handle_subscribe_ack after %d seconds\n", seconds);
251 monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
252 monc->sub_sent = 0;
253 mutex_unlock(&monc->mutex);
254 return;
255bad:
256 pr_err("got corrupt subscribe-ack msg\n");
257 ceph_msg_dump(msg);
258}
259
260/*
261 * Keep track of which maps we have
262 */
263int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
264{
265 mutex_lock(&monc->mutex);
266 monc->have_mdsmap = got;
267 mutex_unlock(&monc->mutex);
268 return 0;
269}
270
271int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
272{
273 mutex_lock(&monc->mutex);
274 monc->have_osdmap = got;
275 monc->want_next_osdmap = 0;
276 mutex_unlock(&monc->mutex);
277 return 0;
278}
279
280/*
281 * Register interest in the next osdmap
282 */
283void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
284{
285 dout("request_next_osdmap have %u\n", monc->have_osdmap);
286 mutex_lock(&monc->mutex);
287 if (!monc->want_next_osdmap)
288 monc->want_next_osdmap = 1;
289 if (monc->want_next_osdmap < 2)
290 __send_subscribe(monc);
291 mutex_unlock(&monc->mutex);
292}
293
294/*
295 * Open a session with the monitor cluster, creating the connection if needed.
296 */
297int ceph_monc_open_session(struct ceph_mon_client *monc)
298{
299 if (!monc->con) {
300 monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
301 if (!monc->con)
302 return -ENOMEM;
303 ceph_con_init(monc->client->msgr, monc->con);
304 monc->con->private = monc;
305 monc->con->ops = &mon_con_ops;
306 }
307
308 mutex_lock(&monc->mutex);
309 __open_session(monc);
310 __schedule_delayed(monc);
311 mutex_unlock(&monc->mutex);
312 return 0;
313}
314
315/*
316 * The monitor responds with a mount ack indicating mount success. The
317 * included client ticket allows the client to talk to MDSs and OSDs.
318 */
319static void ceph_monc_handle_map(struct ceph_mon_client *monc,
320 struct ceph_msg *msg)
321{
322 struct ceph_client *client = monc->client;
323 struct ceph_monmap *monmap = NULL, *old = monc->monmap;
324 void *p, *end;
325
326 mutex_lock(&monc->mutex);
327
328 dout("handle_monmap\n");
329 p = msg->front.iov_base;
330 end = p + msg->front.iov_len;
331
332 monmap = ceph_monmap_decode(p, end);
333 if (IS_ERR(monmap)) {
334 pr_err("problem decoding monmap, %d\n",
335 (int)PTR_ERR(monmap));
336 goto out;
337 }
338
339 if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
340 kfree(monmap);
341 goto out;
342 }
343
344 client->monc.monmap = monmap;
345 kfree(old);
346
347out:
348 mutex_unlock(&monc->mutex);
349 wake_up(&client->auth_wq);
350}
351
352/*
353 * statfs
354 */
355static struct ceph_mon_statfs_request *__lookup_statfs(
356 struct ceph_mon_client *monc, u64 tid)
357{
358 struct ceph_mon_statfs_request *req;
359 struct rb_node *n = monc->statfs_request_tree.rb_node;
360
361 while (n) {
362 req = rb_entry(n, struct ceph_mon_statfs_request, node);
363 if (tid < req->tid)
364 n = n->rb_left;
365 else if (tid > req->tid)
366 n = n->rb_right;
367 else
368 return req;
369 }
370 return NULL;
371}
372
373static void __insert_statfs(struct ceph_mon_client *monc,
374 struct ceph_mon_statfs_request *new)
375{
376 struct rb_node **p = &monc->statfs_request_tree.rb_node;
377 struct rb_node *parent = NULL;
378 struct ceph_mon_statfs_request *req = NULL;
379
380 while (*p) {
381 parent = *p;
382 req = rb_entry(parent, struct ceph_mon_statfs_request, node);
383 if (new->tid < req->tid)
384 p = &(*p)->rb_left;
385 else if (new->tid > req->tid)
386 p = &(*p)->rb_right;
387 else
388 BUG();
389 }
390
391 rb_link_node(&new->node, parent, p);
392 rb_insert_color(&new->node, &monc->statfs_request_tree);
393}
394
395static void handle_statfs_reply(struct ceph_mon_client *monc,
396 struct ceph_msg *msg)
397{
398 struct ceph_mon_statfs_request *req;
399 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
400 u64 tid;
401
402 if (msg->front.iov_len != sizeof(*reply))
403 goto bad;
404 tid = le64_to_cpu(msg->hdr.tid);
405 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
406
407 mutex_lock(&monc->mutex);
408 req = __lookup_statfs(monc, tid);
409 if (req) {
410 *req->buf = reply->st;
411 req->result = 0;
412 }
413 mutex_unlock(&monc->mutex);
414 if (req)
415 complete(&req->completion);
416 return;
417
418bad:
419 pr_err("corrupt statfs reply, no tid\n");
420 ceph_msg_dump(msg);
421}
422
423/*
424 * (re)send a statfs request
425 */
426static int send_statfs(struct ceph_mon_client *monc,
427 struct ceph_mon_statfs_request *req)
428{
429 struct ceph_msg *msg;
430 struct ceph_mon_statfs *h;
431
432 dout("send_statfs tid %llu\n", req->tid);
433 msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL);
434 if (IS_ERR(msg))
435 return PTR_ERR(msg);
436 req->request = msg;
437 msg->hdr.tid = cpu_to_le64(req->tid);
438 h = msg->front.iov_base;
439 h->monhdr.have_version = 0;
440 h->monhdr.session_mon = cpu_to_le16(-1);
441 h->monhdr.session_mon_tid = 0;
442 h->fsid = monc->monmap->fsid;
443 ceph_con_send(monc->con, msg);
444 return 0;
445}
446
447/*
448 * Do a synchronous statfs().
449 */
450int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
451{
452 struct ceph_mon_statfs_request req;
453 int err;
454
455 req.buf = buf;
456 init_completion(&req.completion);
457
458 /* allocate memory for reply */
459 err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
460 if (err)
461 return err;
462
463 /* register request */
464 mutex_lock(&monc->mutex);
465 req.tid = ++monc->last_tid;
466 req.last_attempt = jiffies;
467 req.delay = BASE_DELAY_INTERVAL;
468 __insert_statfs(monc, &req);
469 monc->num_statfs_requests++;
470 mutex_unlock(&monc->mutex);
471
472 /* send request and wait */
473 err = send_statfs(monc, &req);
474 if (!err)
475 err = wait_for_completion_interruptible(&req.completion);
476
477 mutex_lock(&monc->mutex);
478 rb_erase(&req.node, &monc->statfs_request_tree);
479 monc->num_statfs_requests--;
480 ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
481 mutex_unlock(&monc->mutex);
482
483 if (!err)
484 err = req.result;
485 return err;
486}
487
488/*
489 * Resend pending statfs requests.
490 */
491static void __resend_statfs(struct ceph_mon_client *monc)
492{
493 struct ceph_mon_statfs_request *req;
494 struct rb_node *p;
495
496 for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) {
497 req = rb_entry(p, struct ceph_mon_statfs_request, node);
498 send_statfs(monc, req);
499 }
500}
501
502/*
503 * Delayed work. If we haven't mounted yet, retry. Otherwise,
504 * renew/retry subscription as needed (in case it is timing out, or we
505 * got an ENOMEM). And keep the monitor connection alive.
506 */
507static void delayed_work(struct work_struct *work)
508{
509 struct ceph_mon_client *monc =
510 container_of(work, struct ceph_mon_client, delayed_work.work);
511
512 dout("monc delayed_work\n");
513 mutex_lock(&monc->mutex);
514 if (monc->hunting) {
515 __close_session(monc);
516 __open_session(monc); /* continue hunting */
517 } else {
518 ceph_con_keepalive(monc->con);
519
520 __validate_auth(monc);
521
522 if (monc->auth->ops->is_authenticated(monc->auth))
523 __send_subscribe(monc);
524 }
525 __schedule_delayed(monc);
526 mutex_unlock(&monc->mutex);
527}
528
529/*
530 * On startup, we build a temporary monmap populated with the IPs
531 * provided by mount(2).
532 */
533static int build_initial_monmap(struct ceph_mon_client *monc)
534{
535 struct ceph_mount_args *args = monc->client->mount_args;
536 struct ceph_entity_addr *mon_addr = args->mon_addr;
537 int num_mon = args->num_mon;
538 int i;
539
540 /* build initial monmap */
541 monc->monmap = kzalloc(sizeof(*monc->monmap) +
542 num_mon*sizeof(monc->monmap->mon_inst[0]),
543 GFP_KERNEL);
544 if (!monc->monmap)
545 return -ENOMEM;
546 for (i = 0; i < num_mon; i++) {
547 monc->monmap->mon_inst[i].addr = mon_addr[i];
548 monc->monmap->mon_inst[i].addr.nonce = 0;
549 monc->monmap->mon_inst[i].name.type =
550 CEPH_ENTITY_TYPE_MON;
551 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
552 }
553 monc->monmap->num_mon = num_mon;
554 monc->have_fsid = false;
555
556 /* release addr memory */
557 kfree(args->mon_addr);
558 args->mon_addr = NULL;
559 args->num_mon = 0;
560 return 0;
561}
562
563int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
564{
565 int err = 0;
566
567 dout("init\n");
568 memset(monc, 0, sizeof(*monc));
569 monc->client = cl;
570 monc->monmap = NULL;
571 mutex_init(&monc->mutex);
572
573 err = build_initial_monmap(monc);
574 if (err)
575 goto out;
576
577 monc->con = NULL;
578
579 /* authentication */
580 monc->auth = ceph_auth_init(cl->mount_args->name,
581 cl->mount_args->secret);
582 if (IS_ERR(monc->auth)) {
583 err = PTR_ERR(monc->auth);
584 goto out_monmap; /* don't leak the initial monmap */
585 }
584 monc->auth->want_keys =
585 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
586 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
587
588 /* msg pools */
589 err = ceph_msgpool_init(&monc->msgpool_subscribe_ack,
590 sizeof(struct ceph_mon_subscribe_ack), 1, false);
591 if (err < 0)
592 goto out_monmap;
593 err = ceph_msgpool_init(&monc->msgpool_statfs_reply,
594 sizeof(struct ceph_mon_statfs_reply), 0, false);
595 if (err < 0)
596 goto out_pool1;
597 err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false);
598 if (err < 0)
599 goto out_pool2;
600
601 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL);
602 monc->pending_auth = 0;
603 if (IS_ERR(monc->m_auth)) {
604 err = PTR_ERR(monc->m_auth);
605 monc->m_auth = NULL;
606 goto out_pool3;
607 }
608
609 monc->cur_mon = -1;
610 monc->hunting = true;
611 monc->sub_renew_after = jiffies;
612 monc->sub_sent = 0;
613
614 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
615 monc->statfs_request_tree = RB_ROOT;
616 monc->num_statfs_requests = 0;
617 monc->last_tid = 0;
618
619 monc->have_mdsmap = 0;
620 monc->have_osdmap = 0;
621 monc->want_next_osdmap = 1;
622 return 0;
623
624out_pool3:
625 ceph_msgpool_destroy(&monc->msgpool_auth_reply);
626out_pool2:
627 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
628out_pool1:
629 ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
630out_monmap:
631 kfree(monc->monmap);
632out:
633 return err;
634}
635
636void ceph_monc_stop(struct ceph_mon_client *monc)
637{
638 dout("stop\n");
639 cancel_delayed_work_sync(&monc->delayed_work);
640
641 mutex_lock(&monc->mutex);
642 __close_session(monc);
643 if (monc->con) {
644 monc->con->private = NULL;
645 monc->con->ops->put(monc->con);
646 monc->con = NULL;
647 }
648 mutex_unlock(&monc->mutex);
649
650 ceph_auth_destroy(monc->auth);
651
652 ceph_msg_put(monc->m_auth);
653 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
654 ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
655 ceph_msgpool_destroy(&monc->msgpool_auth_reply);
656
657 kfree(monc->monmap);
658}
659
660static void handle_auth_reply(struct ceph_mon_client *monc,
661 struct ceph_msg *msg)
662{
663 int ret;
664
665 mutex_lock(&monc->mutex);
666 monc->pending_auth = 0;
667 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
668 msg->front.iov_len,
669 monc->m_auth->front.iov_base,
670 monc->m_auth->front_max);
671 if (ret < 0) {
672 monc->client->auth_err = ret;
673 wake_up(&monc->client->auth_wq);
674 } else if (ret > 0) {
675 __send_prepared_auth_request(monc, ret);
676 } else if (monc->auth->ops->is_authenticated(monc->auth)) {
677 dout("authenticated, starting session\n");
678
679 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
680 monc->client->msgr->inst.name.num = monc->auth->global_id;
681
682 __send_subscribe(monc);
683 __resend_statfs(monc);
684 }
685 mutex_unlock(&monc->mutex);
686}
687
688static int __validate_auth(struct ceph_mon_client *monc)
689{
690 int ret;
691
692 if (monc->pending_auth)
693 return 0;
694
695 ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
696 monc->m_auth->front_max);
697 if (ret <= 0)
698 return ret; /* either an error, or no need to authenticate */
699 __send_prepared_auth_request(monc, ret);
700 return 0;
701}
702
703int ceph_monc_validate_auth(struct ceph_mon_client *monc)
704{
705 int ret;
706
707 mutex_lock(&monc->mutex);
708 ret = __validate_auth(monc);
709 mutex_unlock(&monc->mutex);
710 return ret;
711}
712
713/*
714 * handle incoming message
715 */
716static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
717{
718 struct ceph_mon_client *monc = con->private;
719 int type = le16_to_cpu(msg->hdr.type);
720
721 if (!monc)
722 return;
723
724 switch (type) {
725 case CEPH_MSG_AUTH_REPLY:
726 handle_auth_reply(monc, msg);
727 break;
728
729 case CEPH_MSG_MON_SUBSCRIBE_ACK:
730 handle_subscribe_ack(monc, msg);
731 break;
732
733 case CEPH_MSG_STATFS_REPLY:
734 handle_statfs_reply(monc, msg);
735 break;
736
737 case CEPH_MSG_MON_MAP:
738 ceph_monc_handle_map(monc, msg);
739 break;
740
741 case CEPH_MSG_MDS_MAP:
742 ceph_mdsc_handle_map(&monc->client->mdsc, msg);
743 break;
744
745 case CEPH_MSG_OSD_MAP:
746 ceph_osdc_handle_map(&monc->client->osdc, msg);
747 break;
748
749 default:
750 pr_err("received unknown message type %d %s\n", type,
751 ceph_msg_type_name(type));
752 }
753 ceph_msg_put(msg);
754}
755
756/*
757 * Allocate memory for incoming message
758 */
759static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
760 struct ceph_msg_header *hdr,
761 int *skip)
762{
763 struct ceph_mon_client *monc = con->private;
764 int type = le16_to_cpu(hdr->type);
765 int front_len = le32_to_cpu(hdr->front_len);
766 struct ceph_msg *m = NULL;
767
768 *skip = 0;
769
770 switch (type) {
771 case CEPH_MSG_MON_SUBSCRIBE_ACK:
772 m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len);
773 break;
774 case CEPH_MSG_STATFS_REPLY:
775 m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len);
776 break;
777 case CEPH_MSG_AUTH_REPLY:
778 m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len);
779 break;
780 case CEPH_MSG_MON_MAP:
781 case CEPH_MSG_MDS_MAP:
782 case CEPH_MSG_OSD_MAP:
783 m = ceph_msg_new(type, front_len, 0, 0, NULL);
784 break;
785 }
786
787 if (!m) {
788 pr_info("alloc_msg unknown type %d\n", type);
789 *skip = 1;
790 }
791 return m;
792}
793
794/*
795 * If the monitor connection resets, pick a new monitor and resubmit
796 * any pending requests.
797 */
798static void mon_fault(struct ceph_connection *con)
799{
800 struct ceph_mon_client *monc = con->private;
801
802 if (!monc)
803 return;
804
805 dout("mon_fault\n");
806 mutex_lock(&monc->mutex);
807 if (!con->private)
808 goto out;
809
810 if (monc->con && !monc->hunting)
811 pr_info("mon%d %s session lost, "
812 "hunting for new mon\n", monc->cur_mon,
813 pr_addr(&monc->con->peer_addr.in_addr));
814
815 __close_session(monc);
816 if (!monc->hunting) {
817 /* start hunting */
818 monc->hunting = true;
819 __open_session(monc);
820 } else {
821 /* already hunting, let's wait a bit */
822 __schedule_delayed(monc);
823 }
824out:
825 mutex_unlock(&monc->mutex);
826}
827
828static const struct ceph_connection_operations mon_con_ops = {
829 .get = ceph_con_get,
830 .put = ceph_con_put,
831 .dispatch = dispatch,
832 .fault = mon_fault,
833 .alloc_msg = mon_alloc_msg,
834};
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
new file mode 100644
index 000000000000..b958ad5afa06
--- /dev/null
+++ b/fs/ceph/mon_client.h
@@ -0,0 +1,119 @@
1#ifndef _FS_CEPH_MON_CLIENT_H
2#define _FS_CEPH_MON_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/rbtree.h>
6
7#include "messenger.h"
8#include "msgpool.h"
9
10struct ceph_client;
11struct ceph_mount_args;
12struct ceph_auth_client;
13
14/*
15 * The monitor map enumerates the set of all monitors.
16 */
17struct ceph_monmap {
18 struct ceph_fsid fsid;
19 u32 epoch;
20 u32 num_mon;
21 struct ceph_entity_inst mon_inst[0];
22};
23
24struct ceph_mon_client;
25struct ceph_mon_statfs_request;
26
27
28/*
29 * Generic mechanism for resending monitor requests.
30 */
31typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
32 int newmon);
33
34/* a pending monitor request */
35struct ceph_mon_request {
36 struct ceph_mon_client *monc;
37 struct delayed_work delayed_work;
38 unsigned long delay;
39 ceph_monc_request_func_t do_request;
40};
41
42/*
43 * statfs() is done a bit differently because we need to get data back
44 * to the caller
45 */
46struct ceph_mon_statfs_request {
47 u64 tid;
48 struct rb_node node;
49 int result;
50 struct ceph_statfs *buf;
51 struct completion completion;
52 unsigned long last_attempt, delay; /* jiffies */
53 struct ceph_msg *request; /* original request */
54};
55
56struct ceph_mon_client {
57 struct ceph_client *client;
58 struct ceph_monmap *monmap;
59
60 struct mutex mutex;
61 struct delayed_work delayed_work;
62
63 struct ceph_auth_client *auth;
64 struct ceph_msg *m_auth;
65 int pending_auth;
66
67 bool hunting;
68 int cur_mon; /* last monitor I contacted */
69 unsigned long sub_sent, sub_renew_after;
70 struct ceph_connection *con;
71 bool have_fsid;
72
73 /* msg pools */
74 struct ceph_msgpool msgpool_subscribe_ack;
75 struct ceph_msgpool msgpool_statfs_reply;
76 struct ceph_msgpool msgpool_auth_reply;
77
78 /* pending statfs requests */
79 struct rb_root statfs_request_tree;
80 int num_statfs_requests;
81 u64 last_tid;
82
83 /* mds/osd map */
84 int want_next_osdmap; /* 1 = want, 2 = want+asked */
85 u32 have_osdmap, have_mdsmap;
86
87#ifdef CONFIG_DEBUG_FS
88 struct dentry *debugfs_file;
89#endif
90};
91
92extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
93extern int ceph_monmap_contains(struct ceph_monmap *m,
94 struct ceph_entity_addr *addr);
95
96extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
97extern void ceph_monc_stop(struct ceph_mon_client *monc);
98
99/*
100 * The model here is to indicate that we need a new map of at least
101 * epoch @want, and also call in when we receive a map. We will
102 * periodically rerequest the map from the monitor cluster until we
103 * get what we want.
104 */
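/*
 * e.g. (sketch): a caller wanting a newer osdmap does
 *
 *	ceph_monc_request_next_osdmap(monc);
 *	...decode the CEPH_MSG_OSD_MAP message when it arrives...
 *	ceph_monc_got_osdmap(monc, epoch);
 */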
105extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
106extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
107
108extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
109
110extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
111 struct ceph_statfs *buf);
112
113extern int ceph_monc_open_session(struct ceph_mon_client *monc);
114
115extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
116
117
118
119#endif
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
new file mode 100644
index 000000000000..ca3b44a89f2d
--- /dev/null
+++ b/fs/ceph/msgpool.c
@@ -0,0 +1,186 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/sched.h>
5#include <linux/types.h>
6#include <linux/vmalloc.h>
7
8#include "msgpool.h"
9
10/*
11 * We use msg pools to preallocate memory for messages we expect to
12 * receive over the wire, to avoid getting ourselves into OOM
13 * conditions at unexpected times. We use a few different
14 * strategies:
15 *
16 * - for request/response type interactions, we preallocate the
17 * memory needed for the response when we generate the request.
18 *
19 * - for messages we can receive at any time from the MDS, we preallocate
20 * a pool of messages we can re-use.
21 *
22 * - for writeback, we preallocate some number of messages to use for
23 * requests and their replies, so that we always make forward
24 * progress.
25 *
26 * The msgpool behaves like a mempool_t, but keeps preallocated
27 * ceph_msgs strung together on a list_head instead of using a pointer
28 * vector. This avoids vector reallocation when we adjust the number
29 * of preallocated items (which happens frequently).
30 */
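/*
 * Typical round trip (sketch, following the statfs path in mon_client.c):
 *
 *	err = ceph_msgpool_resv(&pool, 1);	   - before sending a request
 *	msg = ceph_msgpool_get(&pool, front_len);  - when the reply arrives
 *	ceph_msgpool_put(&pool, msg);		   - done with the reply
 *	ceph_msgpool_resv(&pool, -1);		   - drop the reservation
 */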
31
32
33/*
34 * Allocate or release as necessary to meet our target pool size.
35 */
36static int __fill_msgpool(struct ceph_msgpool *pool)
37{
38 struct ceph_msg *msg;
39
40 while (pool->num < pool->min) {
41 dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
42 pool->min);
43 spin_unlock(&pool->lock);
44 msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
45 spin_lock(&pool->lock);
46 if (IS_ERR(msg))
47 return PTR_ERR(msg);
48 msg->pool = pool;
49 list_add(&msg->list_head, &pool->msgs);
50 pool->num++;
51 }
52 while (pool->num > pool->min) {
53 msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
54 dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
55 pool->min, msg);
56 list_del_init(&msg->list_head);
57 pool->num--;
58 ceph_msg_kfree(msg);
59 }
60 return 0;
61}
62
63int ceph_msgpool_init(struct ceph_msgpool *pool,
64 int front_len, int min, bool blocking)
65{
66 int ret;
67
68 dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
69 spin_lock_init(&pool->lock);
70 pool->front_len = front_len;
71 INIT_LIST_HEAD(&pool->msgs);
72 pool->num = 0;
73 pool->min = min;
74 pool->blocking = blocking;
75 init_waitqueue_head(&pool->wait);
76
77 spin_lock(&pool->lock);
78 ret = __fill_msgpool(pool);
79 spin_unlock(&pool->lock);
80 return ret;
81}
82
83void ceph_msgpool_destroy(struct ceph_msgpool *pool)
84{
85 dout("msgpool_destroy %p\n", pool);
86 spin_lock(&pool->lock);
87 pool->min = 0;
88 __fill_msgpool(pool);
89 spin_unlock(&pool->lock);
90}
91
92int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta)
93{
94 int ret;
95
96 spin_lock(&pool->lock);
97 dout("msgpool_resv %p delta %d\n", pool, delta);
98 pool->min += delta;
99 ret = __fill_msgpool(pool);
100 spin_unlock(&pool->lock);
101 return ret;
102}
103
104struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
105{
106 wait_queue_t wait;
107 struct ceph_msg *msg;
108
109 if (front_len && front_len > pool->front_len) {
110 pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
111 pool, front_len, pool->front_len);
112 WARN_ON(1);
113
114 /* try to alloc a fresh message */
115 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
116 if (!IS_ERR(msg))
117 return msg;
118 }
119
120 if (!front_len)
121 front_len = pool->front_len;
122
123 if (pool->blocking) {
124 /* mempool_t behavior; first try to alloc */
125 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
126 if (!IS_ERR(msg))
127 return msg;
128 }
129
130 while (1) {
131 spin_lock(&pool->lock);
132 if (likely(pool->num)) {
133 msg = list_entry(pool->msgs.next, struct ceph_msg,
134 list_head);
135 list_del_init(&msg->list_head);
136 pool->num--;
137 dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
138 pool->num, pool->min);
139 spin_unlock(&pool->lock);
140 return msg;
141 }
142 pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
143 pool->min, pool->blocking ? "waiting" : "may fail");
144 spin_unlock(&pool->lock);
145
146 if (!pool->blocking) {
147 WARN_ON(1);
148
149 /* maybe we can allocate it now? */
150 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
151 if (!IS_ERR(msg))
152 return msg;
153
154 pr_err("msgpool_get %p empty + alloc failed\n", pool);
155 return ERR_PTR(-ENOMEM);
156 }
157
158 init_wait(&wait);
159 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
160 schedule();
161 finish_wait(&pool->wait, &wait);
162 }
163}
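/*
 * Note: for blocking pools the loop above sleeps in TASK_UNINTERRUPTIBLE
 * until ceph_msgpool_put() reclaims a message and wakes pool->wait, so a
 * get can only fail for non-blocking pools.
 */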
164
165void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
166{
167 spin_lock(&pool->lock);
168 if (pool->num < pool->min) {
169 /* reset msg front_len; user may have changed it */
170 msg->front.iov_len = pool->front_len;
171 msg->hdr.front_len = cpu_to_le32(pool->front_len);
172
173 kref_set(&msg->kref, 1); /* retake a single ref */
174 list_add(&msg->list_head, &pool->msgs);
175 pool->num++;
176 dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
177 pool->num, pool->min);
178 spin_unlock(&pool->lock);
179 wake_up(&pool->wait);
180 } else {
181 dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
182 pool->num, pool->min);
183 spin_unlock(&pool->lock);
184 ceph_msg_kfree(msg);
185 }
186}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
new file mode 100644
index 000000000000..bc834bfcd720
--- /dev/null
+++ b/fs/ceph/msgpool.h
@@ -0,0 +1,27 @@
1#ifndef _FS_CEPH_MSGPOOL
2#define _FS_CEPH_MSGPOOL
3
4#include "messenger.h"
5
6/*
7 * we use memory pools for preallocating messages we may receive, to
8 * avoid unexpected OOM conditions.
9 */
10struct ceph_msgpool {
11 spinlock_t lock;
12 int front_len; /* preallocated payload size */
13 struct list_head msgs; /* msgs in the pool; each has 1 ref */
14 int num, min; /* cur, min # msgs in the pool */
15 bool blocking;
16 wait_queue_head_t wait;
17};
18
19extern int ceph_msgpool_init(struct ceph_msgpool *pool,
20 int front_len, int min, bool blocking);
21extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
22extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
23extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
24 int front_len);
25extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
26
27#endif
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
new file mode 100644
index 000000000000..8aaab414f3f8
--- /dev/null
+++ b/fs/ceph/msgr.h
@@ -0,0 +1,158 @@
1#ifndef __MSGR_H
2#define __MSGR_H
3
4/*
5 * Data types for message passing layer used by Ceph.
6 */
7
8#define CEPH_MON_PORT 6789 /* default monitor port */
9
10/*
11 * client-side processes will try to bind to ports in this
12 * range, simply for the benefit of tools like nmap or wireshark
13 * that would like to identify the protocol.
14 */
15#define CEPH_PORT_FIRST 6789
16#define CEPH_PORT_START 6800 /* non-monitors start here */
17#define CEPH_PORT_LAST 6900
18
19/*
20 * tcp connection banner. include a protocol version, and adjust it
21 * whenever the wire protocol changes. try to keep this string length
22 * constant.
23 */
24#define CEPH_BANNER "ceph v027"
25#define CEPH_BANNER_MAX_LEN 30
26
27
28/*
29 * Rollover-safe type and comparator for 32-bit sequence numbers.
30 * Comparator returns a negative, zero, or positive value.
31 */
32typedef __u32 ceph_seq_t;
33
34static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
35{
36 return (__s32)a - (__s32)b;
37}
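/*
 * Illustrative example (hypothetical helper, not used elsewhere): with
 * a = 2 and b = 0xfffffffe the unsigned subtraction wraps to 4, so
 * (__s32)4 > 0 and 2 is correctly ordered "after" 0xfffffffe despite
 * being numerically smaller.
 */
static inline int ceph_seq_after(ceph_seq_t a, ceph_seq_t b)
{
	return ceph_seq_cmp(a, b) > 0;
}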
38
39
40/*
41 * entity_name -- logical name for a process participating in the
42 * network, e.g. 'mds0' or 'osd3'.
43 */
44struct ceph_entity_name {
45 __u8 type; /* CEPH_ENTITY_TYPE_* */
46 __le64 num;
47} __attribute__ ((packed));
48
49#define CEPH_ENTITY_TYPE_MON 0x01
50#define CEPH_ENTITY_TYPE_MDS 0x02
51#define CEPH_ENTITY_TYPE_OSD 0x04
52#define CEPH_ENTITY_TYPE_CLIENT 0x08
53#define CEPH_ENTITY_TYPE_ADMIN 0x10
54#define CEPH_ENTITY_TYPE_AUTH 0x20
55
56#define CEPH_ENTITY_TYPE_ANY 0xFF
57
58extern const char *ceph_entity_type_name(int type);
59
60/*
61 * entity_addr -- network address
62 */
63struct ceph_entity_addr {
64 __le32 type;
65 __le32 nonce; /* unique id for process (e.g. pid) */
66 struct sockaddr_storage in_addr;
67} __attribute__ ((packed));
68
69struct ceph_entity_inst {
70 struct ceph_entity_name name;
71 struct ceph_entity_addr addr;
72} __attribute__ ((packed));
73
74
75/* used by message exchange protocol */
76#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
77#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
78#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
79 incoming connection */
80#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
81 with higher cseq */
82#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
83 with higher gseq */
84#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
85#define CEPH_MSGR_TAG_MSG 7 /* message */
86#define CEPH_MSGR_TAG_ACK 8 /* message ack */
87#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
88#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
89#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
90#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
91
92
93/*
94 * connection negotiation
95 */
96struct ceph_msg_connect {
97 __le64 features; /* supported feature bits */
98 __le32 host_type; /* CEPH_ENTITY_TYPE_* */
99 __le32 global_seq; /* count connections initiated by this host */
100 __le32 connect_seq; /* count connections initiated in this session */
101 __le32 protocol_version;
102 __le32 authorizer_protocol;
103 __le32 authorizer_len;
104 __u8 flags; /* CEPH_MSG_CONNECT_* */
105} __attribute__ ((packed));
106
107struct ceph_msg_connect_reply {
108 __u8 tag;
109 __le64 features; /* feature bits for this session */
110 __le32 global_seq;
111 __le32 connect_seq;
112 __le32 protocol_version;
113 __le32 authorizer_len;
114 __u8 flags;
115} __attribute__ ((packed));
116
117#define CEPH_MSG_CONNECT_LOSSY 1 /* messages I send may be safely dropped */
118
119
120/*
121 * message header
122 */
123struct ceph_msg_header {
124 __le64 seq; /* message seq# for this session */
125 __le64 tid; /* transaction id */
126 __le16 type; /* message type */
127 __le16 priority; /* priority. higher value == higher priority */
128 __le16 version; /* version of message encoding */
129
130 __le32 front_len; /* bytes in main payload */
131 __le32 middle_len;/* bytes in middle payload */
132 __le32 data_len; /* bytes of data payload */
133 __le16 data_off; /* sender: include full offset;
134 receiver: mask against ~PAGE_MASK */
135
136 struct ceph_entity_inst src, orig_src;
137 __le32 reserved;
138 __le32 crc; /* header crc32c */
139} __attribute__ ((packed));
140
141#define CEPH_MSG_PRIO_LOW 64
142#define CEPH_MSG_PRIO_DEFAULT 127
143#define CEPH_MSG_PRIO_HIGH 196
144#define CEPH_MSG_PRIO_HIGHEST 255
145
146/*
147 * follows data payload
148 */
149struct ceph_msg_footer {
150 __le32 front_crc, middle_crc, data_crc;
151 __u8 flags;
152} __attribute__ ((packed));
153
154#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
155#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
156
157
158#endif
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
new file mode 100644
index 000000000000..dbe63db9762f
--- /dev/null
+++ b/fs/ceph/osd_client.c
@@ -0,0 +1,1537 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/highmem.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/slab.h>
8#include <linux/uaccess.h>
9
10#include "super.h"
11#include "osd_client.h"
12#include "messenger.h"
13#include "decode.h"
14#include "auth.h"
15
16#define OSD_OP_FRONT_LEN 4096
17#define OSD_OPREPLY_FRONT_LEN 512
18
19static const struct ceph_connection_operations osd_con_ops;
20static int __kick_requests(struct ceph_osd_client *osdc,
21 struct ceph_osd *kickosd);
22
23static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
24
25/*
26 * Implement client access to distributed object storage cluster.
27 *
28 * All data objects are stored within a cluster/cloud of OSDs, or
29 * "object storage devices." (Note that Ceph OSDs have _nothing_ to
30 * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
31 * remote daemons serving up and coordinating consistent and safe
32 * access to storage.
33 *
34 * Cluster membership and the mapping of data objects onto storage devices
35 * are described by the osd map.
36 *
37 * We keep track of pending OSD requests (read, write), resubmit
38 * requests to different OSDs when the cluster topology/data layout
39 * change, or retry the affected requests when the communications
40 * channel with an OSD is reset.
41 */
42
43/*
44 * calculate the mapping of a file extent onto an object, and fill out the
45 * request accordingly. shorten extent as necessary if it crosses an
46 * object boundary.
47 *
48 * fill osd op in request message.
49 */
50static void calc_layout(struct ceph_osd_client *osdc,
51 struct ceph_vino vino, struct ceph_file_layout *layout,
52 u64 off, u64 *plen,
53 struct ceph_osd_request *req)
54{
55 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
56 struct ceph_osd_op *op = (void *)(reqhead + 1);
57 u64 orig_len = *plen;
58 u64 objoff, objlen; /* extent in object */
59 u64 bno;
60
61 reqhead->snapid = cpu_to_le64(vino.snap);
62
63 /* object extent? */
64 ceph_calc_file_object_mapping(layout, off, plen, &bno,
65 &objoff, &objlen);
66 if (*plen < orig_len)
67 dout(" skipping last %llu, final file extent %llu~%llu\n",
68 orig_len - *plen, off, *plen);
69
70 sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
71 req->r_oid_len = strlen(req->r_oid);
72
73 op->extent.offset = cpu_to_le64(objoff);
74 op->extent.length = cpu_to_le64(objlen);
75 req->r_num_pages = calc_pages_for(off, *plen);
76
77 dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
78 req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
79}
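/*
 * Worked example (assuming a simple layout with 4 MB objects and no
 * striping across objects): a write with off=6 MB, *plen=4 MB maps to
 * bno=1, objoff=2 MB; only 2 MB remain in object 1, so objlen and
 * *plen are trimmed to 2 MB and the caller must submit the remainder
 * as a separate request.
 */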
80
81/*
82 * requests
83 */
84void ceph_osdc_release_request(struct kref *kref)
85{
86 struct ceph_osd_request *req = container_of(kref,
87 struct ceph_osd_request,
88 r_kref);
89
90 if (req->r_request)
91 ceph_msg_put(req->r_request);
92 if (req->r_reply)
93 ceph_msg_put(req->r_reply);
94 if (req->r_con_filling_msg) {
95 dout("release_request revoking pages %p from con %p\n",
96 req->r_pages, req->r_con_filling_msg);
97 ceph_con_revoke_message(req->r_con_filling_msg,
98 req->r_reply);
99 ceph_con_put(req->r_con_filling_msg);
100 }
101 if (req->r_own_pages)
102 ceph_release_page_vector(req->r_pages,
103 req->r_num_pages);
104 ceph_put_snap_context(req->r_snapc);
105 if (req->r_mempool)
106 mempool_free(req, req->r_osdc->req_mempool);
107 else
108 kfree(req);
109}
110
111/*
112 * build new request AND message, calculate layout, and adjust file
113 * extent as needed.
114 *
115 * if the file was recently truncated, we include information about its
116 * old and new size so that the object can be updated appropriately. (we
117 * avoid synchronously deleting truncated objects because it's slow.)
118 *
119 * if @do_sync, include a 'startsync' command so that the osd will flush
120 * data quickly.
121 */
122struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
123 struct ceph_file_layout *layout,
124 struct ceph_vino vino,
125 u64 off, u64 *plen,
126 int opcode, int flags,
127 struct ceph_snap_context *snapc,
128 int do_sync,
129 u32 truncate_seq,
130 u64 truncate_size,
131 struct timespec *mtime,
132 bool use_mempool, int num_reply)
133{
134 struct ceph_osd_request *req;
135 struct ceph_msg *msg;
136 struct ceph_osd_request_head *head;
137 struct ceph_osd_op *op;
138 void *p;
139 int num_op = 1 + do_sync;
140 size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
141 int i;
142
143 if (use_mempool) {
144 req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
145 memset(req, 0, sizeof(*req));
146 } else {
147 req = kzalloc(sizeof(*req), GFP_NOFS);
148 }
149 if (req == NULL)
150 return ERR_PTR(-ENOMEM);
151
152 req->r_osdc = osdc;
153 req->r_mempool = use_mempool;
154 kref_init(&req->r_kref);
155 init_completion(&req->r_completion);
156 init_completion(&req->r_safe_completion);
157 INIT_LIST_HEAD(&req->r_unsafe_item);
158 req->r_flags = flags;
159
160 WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
161
162 /* create reply message */
163 if (use_mempool)
164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
165 else
166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
167 OSD_OPREPLY_FRONT_LEN, 0, 0, NULL);
168 if (IS_ERR(msg)) {
169 ceph_osdc_put_request(req);
170 return ERR_PTR(PTR_ERR(msg));
171 }
172 req->r_reply = msg;
173
174 /* create request message; allow space for oid */
175 msg_size += 40;
176 if (snapc)
177 msg_size += sizeof(u64) * snapc->num_snaps;
178 if (use_mempool)
179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
180 else
181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL);
182 if (IS_ERR(msg)) {
183 ceph_osdc_put_request(req);
184 return ERR_PTR(PTR_ERR(msg));
185 }
186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
187 memset(msg->front.iov_base, 0, msg->front.iov_len);
188 head = msg->front.iov_base;
189 op = (void *)(head + 1);
190 p = (void *)(op + num_op);
191
192 req->r_request = msg;
193 req->r_snapc = ceph_get_snap_context(snapc);
194
195 head->client_inc = cpu_to_le32(1); /* always, for now. */
196 head->flags = cpu_to_le32(flags);
197 if (flags & CEPH_OSD_FLAG_WRITE)
198 ceph_encode_timespec(&head->mtime, mtime);
199 head->num_ops = cpu_to_le16(num_op);
200 op->op = cpu_to_le16(opcode);
201
202 /* calculate max write size */
203 calc_layout(osdc, vino, layout, off, plen, req);
204 req->r_file_layout = *layout; /* keep a copy */
205
206 if (flags & CEPH_OSD_FLAG_WRITE) {
207 req->r_request->hdr.data_off = cpu_to_le16(off);
208 req->r_request->hdr.data_len = cpu_to_le32(*plen);
209 op->payload_len = cpu_to_le32(*plen);
210 }
211 op->extent.truncate_size = cpu_to_le64(truncate_size);
212 op->extent.truncate_seq = cpu_to_le32(truncate_seq);
213
214 /* fill in oid */
215 head->object_len = cpu_to_le32(req->r_oid_len);
216 memcpy(p, req->r_oid, req->r_oid_len);
217 p += req->r_oid_len;
218
219 if (do_sync) {
220 op++;
221 op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
222 }
223 if (snapc) {
224 head->snap_seq = cpu_to_le64(snapc->seq);
225 head->num_snaps = cpu_to_le32(snapc->num_snaps);
226 for (i = 0; i < snapc->num_snaps; i++) {
227 put_unaligned_le64(snapc->snaps[i], p);
228 p += sizeof(u64);
229 }
230 }
231
232 BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
233 msg_size = p - msg->front.iov_base;
234 msg->front.iov_len = msg_size;
235 msg->hdr.front_len = cpu_to_le32(msg_size);
236 return req;
237}
238
239/*
240 * We keep osd requests in an rbtree, sorted by ->r_tid.
241 */
242static void __insert_request(struct ceph_osd_client *osdc,
243 struct ceph_osd_request *new)
244{
245 struct rb_node **p = &osdc->requests.rb_node;
246 struct rb_node *parent = NULL;
247 struct ceph_osd_request *req = NULL;
248
249 while (*p) {
250 parent = *p;
251 req = rb_entry(parent, struct ceph_osd_request, r_node);
252 if (new->r_tid < req->r_tid)
253 p = &(*p)->rb_left;
254 else if (new->r_tid > req->r_tid)
255 p = &(*p)->rb_right;
256 else
257 BUG();
258 }
259
260 rb_link_node(&new->r_node, parent, p);
261 rb_insert_color(&new->r_node, &osdc->requests);
262}
263
264static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
265 u64 tid)
266{
267 struct ceph_osd_request *req;
268 struct rb_node *n = osdc->requests.rb_node;
269
270 while (n) {
271 req = rb_entry(n, struct ceph_osd_request, r_node);
272 if (tid < req->r_tid)
273 n = n->rb_left;
274 else if (tid > req->r_tid)
275 n = n->rb_right;
276 else
277 return req;
278 }
279 return NULL;
280}
281
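/*
 * Find the request with the smallest tid that is >= the given tid, or
 * NULL if there is none; this lets callers resume an ordered walk of
 * pending requests from an arbitrary point (e.g. when resubmitting).
 */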
282static struct ceph_osd_request *
283__lookup_request_ge(struct ceph_osd_client *osdc,
284 u64 tid)
285{
286 struct ceph_osd_request *req, *best = NULL;
287 struct rb_node *n = osdc->requests.rb_node;
288
289 while (n) {
290 req = rb_entry(n, struct ceph_osd_request, r_node);
291 if (tid < req->r_tid) {
292 /* smallest tid >= target seen so far */
293 best = req;
294 n = n->rb_left;
295 } else if (tid > req->r_tid) {
296 n = n->rb_right;
297 } else {
298 return req;
299 }
300 }
301 return best;
302}
303
304
305/*
306 * If the osd connection drops, resubmit all of that osd's requests.
307 */
308static void osd_reset(struct ceph_connection *con)
309{
310 struct ceph_osd *osd = con->private;
311 struct ceph_osd_client *osdc;
312
313 if (!osd)
314 return;
315 dout("osd_reset osd%d\n", osd->o_osd);
316 osdc = osd->o_osdc;
317 down_read(&osdc->map_sem);
318 kick_requests(osdc, osd);
319 up_read(&osdc->map_sem);
320}
321
322/*
323 * Track open sessions with osds.
324 */
325static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
326{
327 struct ceph_osd *osd;
328
329 osd = kzalloc(sizeof(*osd), GFP_NOFS);
330 if (!osd)
331 return NULL;
332
333 atomic_set(&osd->o_ref, 1);
334 osd->o_osdc = osdc;
335 INIT_LIST_HEAD(&osd->o_requests);
336 INIT_LIST_HEAD(&osd->o_osd_lru);
337 osd->o_incarnation = 1;
338
339 ceph_con_init(osdc->client->msgr, &osd->o_con);
340 osd->o_con.private = osd;
341 osd->o_con.ops = &osd_con_ops;
342 osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
343
344 INIT_LIST_HEAD(&osd->o_keepalive_item);
345 return osd;
346}
347
348static struct ceph_osd *get_osd(struct ceph_osd *osd)
349{
350 if (atomic_inc_not_zero(&osd->o_ref)) {
351 dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
352 atomic_read(&osd->o_ref));
353 return osd;
354 } else {
355 dout("get_osd %p FAIL\n", osd);
356 return NULL;
357 }
358}
359
360static void put_osd(struct ceph_osd *osd)
361{
362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
363 atomic_read(&osd->o_ref) - 1);
364 if (atomic_dec_and_test(&osd->o_ref))
365 kfree(osd);
366}
367
368/*
369 * remove an osd from our map
370 */
371static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
372{
373 dout("__remove_osd %p\n", osd);
374 BUG_ON(!list_empty(&osd->o_requests));
375 rb_erase(&osd->o_node, &osdc->osds);
376 list_del_init(&osd->o_osd_lru);
377 ceph_con_close(&osd->o_con);
378 put_osd(osd);
379}
380
381static void __move_osd_to_lru(struct ceph_osd_client *osdc,
382 struct ceph_osd *osd)
383{
384 dout("__move_osd_to_lru %p\n", osd);
385 BUG_ON(!list_empty(&osd->o_osd_lru));
386 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
387 osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ;
388}
389
390static void __remove_osd_from_lru(struct ceph_osd *osd)
391{
392 dout("__remove_osd_from_lru %p\n", osd);
393 if (!list_empty(&osd->o_osd_lru))
394 list_del_init(&osd->o_osd_lru);
395}
396
397static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
398{
399 struct ceph_osd *osd, *nosd;
400
401 dout("__remove_old_osds %p\n", osdc);
402 mutex_lock(&osdc->request_mutex);
403 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
404 if (!remove_all && time_before(jiffies, osd->lru_ttl))
405 break;
406 __remove_osd(osdc, osd);
407 }
408 mutex_unlock(&osdc->request_mutex);
409}
410
411/*
412 * reset osd connect
413 */
414static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
415{
416 int ret = 0;
417
418 dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
419 if (list_empty(&osd->o_requests)) {
420 __remove_osd(osdc, osd);
421 } else {
422 ceph_con_close(&osd->o_con);
423 ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
424 osd->o_incarnation++;
425 }
426 return ret;
427}
428
429static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
430{
431 struct rb_node **p = &osdc->osds.rb_node;
432 struct rb_node *parent = NULL;
433 struct ceph_osd *osd = NULL;
434
435 while (*p) {
436 parent = *p;
437 osd = rb_entry(parent, struct ceph_osd, o_node);
438 if (new->o_osd < osd->o_osd)
439 p = &(*p)->rb_left;
440 else if (new->o_osd > osd->o_osd)
441 p = &(*p)->rb_right;
442 else
443 BUG();
444 }
445
446 rb_link_node(&new->o_node, parent, p);
447 rb_insert_color(&new->o_node, &osdc->osds);
448}
449
450static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
451{
452 struct ceph_osd *osd;
453 struct rb_node *n = osdc->osds.rb_node;
454
455 while (n) {
456 osd = rb_entry(n, struct ceph_osd, o_node);
457 if (o < osd->o_osd)
458 n = n->rb_left;
459 else if (o > osd->o_osd)
460 n = n->rb_right;
461 else
462 return osd;
463 }
464 return NULL;
465}
466
467static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
468{
469 schedule_delayed_work(&osdc->timeout_work,
470 osdc->client->mount_args->osd_keepalive_timeout * HZ);
471}
472
473static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
474{
475 cancel_delayed_work(&osdc->timeout_work);
476}
477
478/*
479 * Register request, assign tid. If this is the first request, set up
480 * the timeout event.
481 */
482static void register_request(struct ceph_osd_client *osdc,
483 struct ceph_osd_request *req)
484{
485 mutex_lock(&osdc->request_mutex);
486 req->r_tid = ++osdc->last_tid;
487 req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
488 INIT_LIST_HEAD(&req->r_req_lru_item);
489
490 dout("register_request %p tid %lld\n", req, req->r_tid);
491 __insert_request(osdc, req);
492 ceph_osdc_get_request(req);
493 osdc->num_requests++;
494
495 if (osdc->num_requests == 1) {
496 dout(" first request, scheduling timeout\n");
497 __schedule_osd_timeout(osdc);
498 }
499 mutex_unlock(&osdc->request_mutex);
500}
501
502/*
503 * called under osdc->request_mutex
504 */
505static void __unregister_request(struct ceph_osd_client *osdc,
506 struct ceph_osd_request *req)
507{
508 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
509 rb_erase(&req->r_node, &osdc->requests);
510 osdc->num_requests--;
511
512 if (req->r_osd) {
513 /* make sure the original request isn't in flight. */
514 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
515
516 list_del_init(&req->r_osd_item);
517 if (list_empty(&req->r_osd->o_requests))
518 __move_osd_to_lru(osdc, req->r_osd);
519 req->r_osd = NULL;
520 }
521
522 ceph_osdc_put_request(req);
523
524 list_del_init(&req->r_req_lru_item);
525 if (osdc->num_requests == 0) {
526 dout(" no requests, canceling timeout\n");
527 __cancel_osd_timeout(osdc);
528 }
529}
530
531/*
532 * Cancel a previously queued request message
533 */
534static void __cancel_request(struct ceph_osd_request *req)
535{
536 if (req->r_sent) {
537 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
538 req->r_sent = 0;
539 }
540 list_del_init(&req->r_req_lru_item);
541}
542
543/*
544 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
545 * (as needed), and set the request r_osd appropriately. If there is
546 * no up osd, set r_osd to NULL.
547 *
548 * Return 0 if unchanged, 1 if changed, or negative on error.
549 *
550 * Caller should hold map_sem for read and request_mutex.
551 */
552static int __map_osds(struct ceph_osd_client *osdc,
553 struct ceph_osd_request *req)
554{
555 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
556 struct ceph_pg pgid;
557 int o = -1;
558 int err;
559
560 dout("map_osds %p tid %lld\n", req, req->r_tid);
561 err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
562 &req->r_file_layout, osdc->osdmap);
563 if (err)
564 return err;
565 pgid = reqhead->layout.ol_pgid;
566 req->r_pgid = pgid;
567
568 o = ceph_calc_pg_primary(osdc->osdmap, pgid);
569
570 if ((req->r_osd && req->r_osd->o_osd == o &&
571 req->r_sent >= req->r_osd->o_incarnation) ||
572 (req->r_osd == NULL && o == -1))
573 return 0; /* no change */
574
575 dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
576 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
577 req->r_osd ? req->r_osd->o_osd : -1);
578
579 if (req->r_osd) {
580 __cancel_request(req);
581 list_del_init(&req->r_osd_item);
582 req->r_osd = NULL;
583 }
584
585 req->r_osd = __lookup_osd(osdc, o);
586 if (!req->r_osd && o >= 0) {
587 err = -ENOMEM;
588 req->r_osd = create_osd(osdc);
589 if (!req->r_osd)
590 goto out;
591
592 dout("map_osds osd %p is osd%d\n", req->r_osd, o);
593 req->r_osd->o_osd = o;
594 req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
595 __insert_osd(osdc, req->r_osd);
596
597 ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
598 }
599
600 if (req->r_osd) {
601 __remove_osd_from_lru(req->r_osd);
602 list_add(&req->r_osd_item, &req->r_osd->o_requests);
603 }
604 err = 1; /* osd changed */
605
606out:
607 return err;
608}
609
610/*
611 * caller should hold map_sem (for read) and request_mutex
612 */
613static int __send_request(struct ceph_osd_client *osdc,
614 struct ceph_osd_request *req)
615{
616 struct ceph_osd_request_head *reqhead;
617 int err;
618
619 err = __map_osds(osdc, req);
620 if (err < 0)
621 return err;
622 if (req->r_osd == NULL) {
623 dout("send_request %p no up osds in pg\n", req);
624 ceph_monc_request_next_osdmap(&osdc->client->monc);
625 return 0;
626 }
627
628 dout("send_request %p tid %llu to osd%d flags %d\n",
629 req, req->r_tid, req->r_osd->o_osd, req->r_flags);
630
631 reqhead = req->r_request->front.iov_base;
632 reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
633 reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
634 reqhead->reassert_version = req->r_reassert_version;
635
636 req->r_sent_stamp = jiffies;
637 list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
638
639 ceph_msg_get(req->r_request); /* send consumes a ref */
640 ceph_con_send(&req->r_osd->o_con, req->r_request);
641 req->r_sent = req->r_osd->o_incarnation;
642 return 0;
643}
644
645/*
646 * Timeout callback, called every N seconds when 1 or more osd
647 * requests have been active for more than N seconds. When this
648 * happens, we ping all OSDs with timed-out requests to ensure any
649 * communications channel reset is detected. Reset the request
650 * timeouts another N seconds in the future as we go, and reschedule
651 * the timeout event another N seconds out (unless there are no
652 * open requests).
653 */
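/*
 * Two thresholds apply here: mount_args->osd_timeout triggers a kick
 * and reset of the osd, while osd_keepalive_timeout merely pings it.
 */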
654static void handle_timeout(struct work_struct *work)
655{
656 struct ceph_osd_client *osdc =
657 container_of(work, struct ceph_osd_client, timeout_work.work);
658 struct ceph_osd_request *req, *last_req = NULL;
659 struct ceph_osd *osd;
660 unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
661 unsigned long keepalive =
662 osdc->client->mount_args->osd_keepalive_timeout * HZ;
663 unsigned long last_sent = 0;
664 struct rb_node *p;
665 struct list_head slow_osds;
666
667 dout("timeout\n");
668 down_read(&osdc->map_sem);
669
670 ceph_monc_request_next_osdmap(&osdc->client->monc);
671
672 mutex_lock(&osdc->request_mutex);
673 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
674 req = rb_entry(p, struct ceph_osd_request, r_node);
675
676 if (req->r_resend) {
677 int err;
678
679 dout("osdc resending prev failed %lld\n", req->r_tid);
680 err = __send_request(osdc, req);
681 if (err)
682 dout("osdc failed again on %lld\n", req->r_tid);
683 else
684 req->r_resend = false;
685 continue;
686 }
687 }
688
689 /*
690 * reset osds that appear to be _really_ unresponsive. this
691 * is a failsafe measure; we really shouldn't be getting to
692 * this point if the system is working properly. the monitors
693 * should mark the osd as failed and we should find out about
694 * it from an updated osd map.
695 */
696 while (!list_empty(&osdc->req_lru)) {
697 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
698 r_req_lru_item);
699
700 if (time_before(jiffies, req->r_sent_stamp + timeout))
701 break;
702
703 BUG_ON(req == last_req && req->r_sent_stamp == last_sent);
704 last_req = req;
705 last_sent = req->r_sent_stamp;
706
707 osd = req->r_osd;
708 BUG_ON(!osd);
709 pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
710 req->r_tid, osd->o_osd);
711 __kick_requests(osdc, osd);
712 }
713
714 /*
715 * ping osds that are a bit slow. this ensures that if there
716 * is a break in the TCP connection we will notice, and reopen
717 * a connection with that osd (from the fault callback).
718 */
719 INIT_LIST_HEAD(&slow_osds);
720 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
721 if (time_before(jiffies, req->r_sent_stamp + keepalive))
722 break;
723
724 osd = req->r_osd;
725 BUG_ON(!osd);
726 dout(" tid %llu is slow, will send keepalive on osd%d\n",
727 req->r_tid, osd->o_osd);
728 list_move_tail(&osd->o_keepalive_item, &slow_osds);
729 }
730 while (!list_empty(&slow_osds)) {
731 osd = list_entry(slow_osds.next, struct ceph_osd,
732 o_keepalive_item);
733 list_del_init(&osd->o_keepalive_item);
734 ceph_con_keepalive(&osd->o_con);
735 }
736
737 __schedule_osd_timeout(osdc);
738 mutex_unlock(&osdc->request_mutex);
739
740 up_read(&osdc->map_sem);
741}
742
743static void handle_osds_timeout(struct work_struct *work)
744{
745 struct ceph_osd_client *osdc =
746 container_of(work, struct ceph_osd_client,
747 osds_timeout_work.work);
748 unsigned long delay =
749 osdc->client->mount_args->osd_idle_ttl * HZ >> 2;
750
751 dout("osds timeout\n");
752 down_read(&osdc->map_sem);
753 remove_old_osds(osdc, 0);
754 up_read(&osdc->map_sem);
755
756 schedule_delayed_work(&osdc->osds_timeout_work,
757 round_jiffies_relative(delay));
758}
759
760/*
761 * handle osd op reply. either call the callback if it is specified,
762 * or do the completion to wake up the waiting thread.
763 */
764static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
765 struct ceph_connection *con)
766{
767 struct ceph_osd_reply_head *rhead = msg->front.iov_base;
768 struct ceph_osd_request *req;
769 u64 tid;
770 int numops, object_len, flags;
771
772 tid = le64_to_cpu(msg->hdr.tid);
773 if (msg->front.iov_len < sizeof(*rhead))
774 goto bad;
775 numops = le32_to_cpu(rhead->num_ops);
776 object_len = le32_to_cpu(rhead->object_len);
777 if (msg->front.iov_len != sizeof(*rhead) + object_len +
778 numops * sizeof(struct ceph_osd_op))
779 goto bad;
780 dout("handle_reply %p tid %llu\n", msg, tid);
781
782 /* lookup */
783 mutex_lock(&osdc->request_mutex);
784 req = __lookup_request(osdc, tid);
785 if (req == NULL) {
786 dout("handle_reply tid %llu dne\n", tid);
787 mutex_unlock(&osdc->request_mutex);
788 return;
789 }
790 ceph_osdc_get_request(req);
791 flags = le32_to_cpu(rhead->flags);
792
793 /*
794 * if this connection filled our message, drop our reference now, to
795 * avoid a (safe but slower) revoke later.
796 */
797 if (req->r_con_filling_msg == con && req->r_reply == msg) {
798 dout(" dropping con_filling_msg ref %p\n", con);
799 req->r_con_filling_msg = NULL;
800 ceph_con_put(con);
801 }
802
803 if (!req->r_got_reply) {
804 unsigned bytes;
805
806 req->r_result = le32_to_cpu(rhead->result);
807 bytes = le32_to_cpu(msg->hdr.data_len);
808 dout("handle_reply result %d bytes %d\n", req->r_result,
809 bytes);
810 if (req->r_result == 0)
811 req->r_result = bytes;
812
813 /* save reassert version, in case this is a write we need to replay */
814 req->r_reassert_version = rhead->reassert_version;
815
816 req->r_got_reply = 1;
817 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
818 dout("handle_reply tid %llu dup ack\n", tid);
819 mutex_unlock(&osdc->request_mutex);
820 goto done;
821 }
822
823 dout("handle_reply tid %llu flags %d\n", tid, flags);
824
825 /* either this is a read, or we got the safe response */
826 if ((flags & CEPH_OSD_FLAG_ONDISK) ||
827 ((flags & CEPH_OSD_FLAG_WRITE) == 0))
828 __unregister_request(osdc, req);
829
830 mutex_unlock(&osdc->request_mutex);
831
832 if (req->r_callback)
833 req->r_callback(req, msg);
834 else
835 complete(&req->r_completion);
836
837 if (flags & CEPH_OSD_FLAG_ONDISK) {
838 if (req->r_safe_callback)
839 req->r_safe_callback(req, msg);
840 complete(&req->r_safe_completion); /* fsync waiter */
841 }
842
843done:
844 ceph_osdc_put_request(req);
845 return;
846
847bad:
848 pr_err("corrupt osd_op_reply got %d %d expected %d\n",
849 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
850 (int)sizeof(*rhead));
851 ceph_msg_dump(msg);
852}
853
854
855static int __kick_requests(struct ceph_osd_client *osdc,
856 struct ceph_osd *kickosd)
857{
858 struct ceph_osd_request *req;
859 struct rb_node *p, *n;
860 int needmap = 0;
861 int err;
862
863 dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
864 if (kickosd) {
865 __reset_osd(osdc, kickosd);
866 } else {
867 for (p = rb_first(&osdc->osds); p; p = n) {
868 struct ceph_osd *osd =
869 rb_entry(p, struct ceph_osd, o_node);
870
871 n = rb_next(p);
872 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
873 memcmp(&osd->o_con.peer_addr,
874 ceph_osd_addr(osdc->osdmap,
875 osd->o_osd),
876 sizeof(struct ceph_entity_addr)) != 0)
877 __reset_osd(osdc, osd);
878 }
879 }
880
881 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
882 req = rb_entry(p, struct ceph_osd_request, r_node);
883
884 if (req->r_resend) {
885 dout(" r_resend set on tid %llu\n", req->r_tid);
886 __cancel_request(req);
887 goto kick;
888 }
889 if (req->r_osd && kickosd == req->r_osd) {
890 __cancel_request(req);
891 goto kick;
892 }
893
894 err = __map_osds(osdc, req);
895 if (err == 0)
896 continue; /* no change */
897 if (err < 0) {
898 /*
899 * FIXME: really, we should set the request
900 * error and fail if this isn't a 'nofail'
901 * request, but that's a fair bit more
902 * complicated to do. So retry!
903 */
904 dout(" setting r_resend on %llu\n", req->r_tid);
905 req->r_resend = true;
906 continue;
907 }
908 if (req->r_osd == NULL) {
909 dout("tid %llu maps to no valid osd\n", req->r_tid);
910 needmap++; /* request a newer map */
911 continue;
912 }
913
914kick:
915 dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
916 req->r_osd->o_osd);
917 req->r_flags |= CEPH_OSD_FLAG_RETRY;
918 err = __send_request(osdc, req);
919 if (err) {
920 dout(" setting r_resend on %llu\n", req->r_tid);
921 req->r_resend = true;
922 }
923 }
924
925 return needmap;
926}
927
928/*
929 * Resubmit osd requests whose osd or osd address has changed. Request
930 * a new osd map if osds are down, or we are otherwise unable to determine
931 * how to direct a request.
932 *
933 * Close connections to down osds.
934 *
935 * If @who is specified, resubmit requests for that specific osd.
936 *
937 * Caller should hold map_sem for read and request_mutex.
938 */
939static void kick_requests(struct ceph_osd_client *osdc,
940 struct ceph_osd *kickosd)
941{
942 int needmap;
943
944 mutex_lock(&osdc->request_mutex);
945 needmap = __kick_requests(osdc, kickosd);
946 mutex_unlock(&osdc->request_mutex);
947
948 if (needmap) {
949 dout("%d requests for down osds, need new map\n", needmap);
950 ceph_monc_request_next_osdmap(&osdc->client->monc);
951 }
952}
953
954/*
955 * Process updated osd map.
956 *
957 * The message contains any number of incremental and full maps, normally
958 * indicating some sort of topology change in the cluster. Kick requests
959 * off to different OSDs as needed.
960 */
961void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
962{
963 void *p, *end, *next;
964 u32 nr_maps, maplen;
965 u32 epoch;
966 struct ceph_osdmap *newmap = NULL, *oldmap;
967 int err;
968 struct ceph_fsid fsid;
969
970 dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
971 p = msg->front.iov_base;
972 end = p + msg->front.iov_len;
973
974 /* verify fsid */
975 ceph_decode_need(&p, end, sizeof(fsid), bad);
976 ceph_decode_copy(&p, &fsid, sizeof(fsid));
977 if (ceph_check_fsid(osdc->client, &fsid) < 0)
978 return;
979
980 down_write(&osdc->map_sem);
981
982 /* incremental maps */
983 ceph_decode_32_safe(&p, end, nr_maps, bad);
984 dout(" %d inc maps\n", nr_maps);
985 while (nr_maps > 0) {
986 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
987 epoch = ceph_decode_32(&p);
988 maplen = ceph_decode_32(&p);
989 ceph_decode_need(&p, end, maplen, bad);
990 next = p + maplen;
991 if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
992 dout("applying incremental map %u len %d\n",
993 epoch, maplen);
994 newmap = osdmap_apply_incremental(&p, next,
995 osdc->osdmap,
996 osdc->client->msgr);
997 if (IS_ERR(newmap)) {
998 err = PTR_ERR(newmap);
999 goto bad;
1000 }
1001 BUG_ON(!newmap);
1002 if (newmap != osdc->osdmap) {
1003 ceph_osdmap_destroy(osdc->osdmap);
1004 osdc->osdmap = newmap;
1005 }
1006 } else {
1007 dout("ignoring incremental map %u len %d\n",
1008 epoch, maplen);
1009 }
1010 p = next;
1011 nr_maps--;
1012 }
1013 if (newmap)
1014 goto done;
1015
1016 /* full maps */
1017 ceph_decode_32_safe(&p, end, nr_maps, bad);
1018 dout(" %d full maps\n", nr_maps);
1019 while (nr_maps) {
1020 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1021 epoch = ceph_decode_32(&p);
1022 maplen = ceph_decode_32(&p);
1023 ceph_decode_need(&p, end, maplen, bad);
1024 if (nr_maps > 1) {
1025 dout("skipping non-latest full map %u len %d\n",
1026 epoch, maplen);
1027 } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
1028 dout("skipping full map %u len %d, "
1029 "older than our %u\n", epoch, maplen,
1030 osdc->osdmap->epoch);
1031 } else {
1032 dout("taking full map %u len %d\n", epoch, maplen);
1033 newmap = osdmap_decode(&p, p+maplen);
1034 if (IS_ERR(newmap)) {
1035 err = PTR_ERR(newmap);
1036 goto bad;
1037 }
1038 BUG_ON(!newmap);
1039 oldmap = osdc->osdmap;
1040 osdc->osdmap = newmap;
1041 if (oldmap)
1042 ceph_osdmap_destroy(oldmap);
1043 }
1044 p += maplen;
1045 nr_maps--;
1046 }
1047
1048done:
1049 downgrade_write(&osdc->map_sem);
1050 ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
1051 if (newmap)
1052 kick_requests(osdc, NULL);
1053 up_read(&osdc->map_sem);
1054 return;
1055
1056bad:
1057 pr_err("osdc handle_map corrupt msg\n");
1058 ceph_msg_dump(msg);
1059 up_write(&osdc->map_sem);
1060 return;
1061}
1062
1063
1064/*
1065 * A read request prepares specific pages that data is to be read into.
1066 * When a message is being read off the wire, we call __prepare_pages
1067 * to point the message at those pages.
1068 * Returns 0 on success, -1 on failure.
1069 */
1070static int __prepare_pages(struct ceph_connection *con,
1071 struct ceph_msg_header *hdr,
1072 struct ceph_osd_request *req,
1073 u64 tid,
1074 struct ceph_msg *m)
1075{
1076 struct ceph_osd *osd = con->private;
1077 struct ceph_osd_client *osdc;
1078 int ret = -1;
1079 int data_len = le32_to_cpu(hdr->data_len);
1080 unsigned data_off = le16_to_cpu(hdr->data_off);
1081
1082 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1083
1084 if (!osd)
1085 return -1;
1086
1087 osdc = osd->o_osdc;
1088
1089 dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
1090 tid, req->r_num_pages, want);
1091 if (unlikely(req->r_num_pages < want))
1092 goto out;
1093 m->pages = req->r_pages;
1094 m->nr_pages = req->r_num_pages;
1095 ret = 0; /* success */
1096out:
1097 BUG_ON(ret < 0 || m->nr_pages < want);
1098
1099 return ret;
1100}
1101
1102/*
1103 * Register request, send initial attempt.
1104 */
1105int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1106 struct ceph_osd_request *req,
1107 bool nofail)
1108{
1109 int rc = 0;
1110
1111 req->r_request->pages = req->r_pages;
1112 req->r_request->nr_pages = req->r_num_pages;
1113
1114 register_request(osdc, req);
1115
1116 down_read(&osdc->map_sem);
1117 mutex_lock(&osdc->request_mutex);
1118 /*
1119 * a racing kick_requests() may have sent the message for us
1120 * while we dropped request_mutex above, so only send now if
1121 * the request hasn't been touched yet.
1122 */
1123 if (req->r_sent == 0) {
1124 rc = __send_request(osdc, req);
1125 if (rc) {
1126 if (nofail) {
1127 dout("osdc_start_request failed send, "
1128 " marking %lld\n", req->r_tid);
1129 req->r_resend = true;
1130 rc = 0;
1131 } else {
1132 __unregister_request(osdc, req);
1133 }
1134 }
1135 }
1136 mutex_unlock(&osdc->request_mutex);
1137 up_read(&osdc->map_sem);
1138 return rc;
1139}
1140
1141/*
1142 * wait for a request to complete
1143 */
1144int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
1145 struct ceph_osd_request *req)
1146{
1147 int rc;
1148
1149 rc = wait_for_completion_interruptible(&req->r_completion);
1150 if (rc < 0) {
1151 mutex_lock(&osdc->request_mutex);
1152 __cancel_request(req);
1153 __unregister_request(osdc, req);
1154 mutex_unlock(&osdc->request_mutex);
1155 dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
1156 return rc;
1157 }
1158
1159 dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
1160 return req->r_result;
1161}
1162
1163/*
1164 * sync - wait for all in-flight writes to commit; iterate by tid to
1165 * avoid starvation from newly submitted requests.
1166 */
1166void ceph_osdc_sync(struct ceph_osd_client *osdc)
1167{
1168 struct ceph_osd_request *req;
1169 u64 last_tid, next_tid = 0;
1170
1171 mutex_lock(&osdc->request_mutex);
1172 last_tid = osdc->last_tid;
1173 while (1) {
1174 req = __lookup_request_ge(osdc, next_tid);
1175 if (!req)
1176 break;
1177 if (req->r_tid > last_tid)
1178 break;
1179
1180 next_tid = req->r_tid + 1;
1181 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
1182 continue;
1183
1184 ceph_osdc_get_request(req);
1185 mutex_unlock(&osdc->request_mutex);
1186 dout("sync waiting on tid %llu (last is %llu)\n",
1187 req->r_tid, last_tid);
1188 wait_for_completion(&req->r_safe_completion);
1189 mutex_lock(&osdc->request_mutex);
1190 ceph_osdc_put_request(req);
1191 }
1192 mutex_unlock(&osdc->request_mutex);
1193 dout("sync done (thru tid %llu)\n", last_tid);
1194}
1195
1196/*
1197 * init, shutdown
1198 */
1199int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1200{
1201 int err;
1202
1203 dout("init\n");
1204 osdc->client = client;
1205 osdc->osdmap = NULL;
1206 init_rwsem(&osdc->map_sem);
1207 init_completion(&osdc->map_waiters);
1208 osdc->last_requested_map = 0;
1209 mutex_init(&osdc->request_mutex);
1210 osdc->last_tid = 0;
1211 osdc->osds = RB_ROOT;
1212 INIT_LIST_HEAD(&osdc->osd_lru);
1213 osdc->requests = RB_ROOT;
1214 INIT_LIST_HEAD(&osdc->req_lru);
1215 osdc->num_requests = 0;
1216 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
1217 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
1218
1219 schedule_delayed_work(&osdc->osds_timeout_work,
1220 round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ));
1221
1222 err = -ENOMEM;
1223 osdc->req_mempool = mempool_create_kmalloc_pool(10,
1224 sizeof(struct ceph_osd_request));
1225 if (!osdc->req_mempool)
1226 goto out;
1227
1228 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true);
1229 if (err < 0)
1230 goto out_mempool;
1231 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1232 OSD_OPREPLY_FRONT_LEN, 10, true);
1233 if (err < 0)
1234 goto out_msgpool;
1235 return 0;
1236
1237out_msgpool:
1238 ceph_msgpool_destroy(&osdc->msgpool_op);
1239out_mempool:
1240 mempool_destroy(osdc->req_mempool);
1241out:
1242 return err;
1243}
1244
1245void ceph_osdc_stop(struct ceph_osd_client *osdc)
1246{
1247 cancel_delayed_work_sync(&osdc->timeout_work);
1248 cancel_delayed_work_sync(&osdc->osds_timeout_work);
1249 if (osdc->osdmap) {
1250 ceph_osdmap_destroy(osdc->osdmap);
1251 osdc->osdmap = NULL;
1252 }
1253 remove_old_osds(osdc, 1);
1254 mempool_destroy(osdc->req_mempool);
1255 ceph_msgpool_destroy(&osdc->msgpool_op);
1256 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
1257}
1258
1259/*
1260 * Read some contiguous pages. If we cross a stripe boundary, shorten
1261 * *plen. Return number of bytes read, or error.
1262 */
1263int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1264 struct ceph_vino vino, struct ceph_file_layout *layout,
1265 u64 off, u64 *plen,
1266 u32 truncate_seq, u64 truncate_size,
1267 struct page **pages, int num_pages)
1268{
1269 struct ceph_osd_request *req;
1270 int rc = 0;
1271
1272 dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
1273 vino.snap, off, *plen);
1274 req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
1275 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1276 NULL, 0, truncate_seq, truncate_size, NULL,
1277 false, 1);
1278 if (IS_ERR(req))
1279 return PTR_ERR(req);
1280
1281 /* it may be a short read due to an object boundary */
1282 req->r_pages = pages;
1283 num_pages = calc_pages_for(off, *plen);
1284 req->r_num_pages = num_pages;
1285
1286 dout("readpages final extent is %llu~%llu (%d pages)\n",
1287 off, *plen, req->r_num_pages);
1288
1289 rc = ceph_osdc_start_request(osdc, req, false);
1290 if (!rc)
1291 rc = ceph_osdc_wait_request(osdc, req);
1292
1293 ceph_osdc_put_request(req);
1294 dout("readpages result %d\n", rc);
1295 return rc;
1296}
1297
1298/*
1299 * do a synchronous write on N pages
1300 */
1301int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1302 struct ceph_file_layout *layout,
1303 struct ceph_snap_context *snapc,
1304 u64 off, u64 len,
1305 u32 truncate_seq, u64 truncate_size,
1306 struct timespec *mtime,
1307 struct page **pages, int num_pages,
1308 int flags, int do_sync, bool nofail)
1309{
1310 struct ceph_osd_request *req;
1311 int rc = 0;
1312
1313 BUG_ON(vino.snap != CEPH_NOSNAP);
1314 req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
1315 CEPH_OSD_OP_WRITE,
1316 flags | CEPH_OSD_FLAG_ONDISK |
1317 CEPH_OSD_FLAG_WRITE,
1318 snapc, do_sync,
1319 truncate_seq, truncate_size, mtime,
1320 nofail, 1);
1321 if (IS_ERR(req))
1322 return PTR_ERR(req);
1323
1324 /* it may be a short write due to an object boundary */
1325 req->r_pages = pages;
1326 req->r_num_pages = calc_pages_for(off, len);
1327 dout("writepages %llu~%llu (%d pages)\n", off, len,
1328 req->r_num_pages);
1329
1330 rc = ceph_osdc_start_request(osdc, req, nofail);
1331 if (!rc)
1332 rc = ceph_osdc_wait_request(osdc, req);
1333
1334 ceph_osdc_put_request(req);
1335 if (rc == 0)
1336 rc = len;
1337 dout("writepages result %d\n", rc);
1338 return rc;
1339}
1340
1341/*
1342 * handle incoming message
1343 */
1344static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1345{
1346 struct ceph_osd *osd = con->private;
1347 struct ceph_osd_client *osdc;
1348 int type = le16_to_cpu(msg->hdr.type);
1349
1350 if (!osd)
1351 return;
1352 osdc = osd->o_osdc;
1353
1354 switch (type) {
1355 case CEPH_MSG_OSD_MAP:
1356 ceph_osdc_handle_map(osdc, msg);
1357 break;
1358 case CEPH_MSG_OSD_OPREPLY:
1359 handle_reply(osdc, msg, con);
1360 break;
1361
1362 default:
1363 pr_err("received unknown message type %d %s\n", type,
1364 ceph_msg_type_name(type));
1365 }
1366 ceph_msg_put(msg);
1367}
1368
1369/*
1370 * lookup and return message for incoming reply
1371 */
1372static struct ceph_msg *get_reply(struct ceph_connection *con,
1373 struct ceph_msg_header *hdr,
1374 int *skip)
1375{
1376 struct ceph_osd *osd = con->private;
1377 struct ceph_osd_client *osdc = osd->o_osdc;
1378 struct ceph_msg *m;
1379 struct ceph_osd_request *req;
1380 int front = le32_to_cpu(hdr->front_len);
1381 int data_len = le32_to_cpu(hdr->data_len);
1382 u64 tid;
1383 int err;
1384
1385 tid = le64_to_cpu(hdr->tid);
1386 mutex_lock(&osdc->request_mutex);
1387 req = __lookup_request(osdc, tid);
1388 if (!req) {
1389 *skip = 1;
1390 m = NULL;
1391 pr_info("get_reply unknown tid %llu from osd%d\n", tid,
1392 osd->o_osd);
1393 goto out;
1394 }
1395
1396 if (req->r_con_filling_msg) {
1397 dout("get_reply revoking msg %p from old con %p\n",
1398 req->r_reply, req->r_con_filling_msg);
1399 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1400 ceph_con_put(req->r_con_filling_msg);
1401 }
1402
1403 if (front > req->r_reply->front.iov_len) {
1404 pr_warning("get_reply front %d > preallocated %d\n",
1405 front, (int)req->r_reply->front.iov_len);
1406 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL);
1407 if (IS_ERR(m))
1408 goto out;
1409 ceph_msg_put(req->r_reply);
1410 req->r_reply = m;
1411 }
1412 m = ceph_msg_get(req->r_reply);
1413
1414 if (data_len > 0) {
1415 err = __prepare_pages(con, hdr, req, tid, m);
1416 if (err < 0) {
1417 *skip = 1;
1418 ceph_msg_put(m);
1419 m = ERR_PTR(err);
1420 goto out; /* don't take a con ref on error */
1421 }
1422 }
1423 *skip = 0;
1424 req->r_con_filling_msg = ceph_con_get(con);
1425 dout("get_reply tid %lld %p\n", tid, m);
1426out:
1427 mutex_unlock(&osdc->request_mutex);
1428 return m;
1429}
1431
1432static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1433 struct ceph_msg_header *hdr,
1434 int *skip)
1435{
1436 struct ceph_osd *osd = con->private;
1437 int type = le16_to_cpu(hdr->type);
1438 int front = le32_to_cpu(hdr->front_len);
1439
1440 switch (type) {
1441 case CEPH_MSG_OSD_MAP:
1442 return ceph_msg_new(type, front, 0, 0, NULL);
1443 case CEPH_MSG_OSD_OPREPLY:
1444 return get_reply(con, hdr, skip);
1445 default:
1446 pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
1447 osd->o_osd);
1448 *skip = 1;
1449 return NULL;
1450 }
1451}
1452
1453/*
1454 * Wrappers to refcount containing ceph_osd struct
1455 */
1456static struct ceph_connection *get_osd_con(struct ceph_connection *con)
1457{
1458 struct ceph_osd *osd = con->private;
1459 if (get_osd(osd))
1460 return con;
1461 return NULL;
1462}
1463
1464static void put_osd_con(struct ceph_connection *con)
1465{
1466 struct ceph_osd *osd = con->private;
1467 put_osd(osd);
1468}
1469
1470/*
1471 * authentication
1472 */
1473static int get_authorizer(struct ceph_connection *con,
1474 void **buf, int *len, int *proto,
1475 void **reply_buf, int *reply_len, int force_new)
1476{
1477 struct ceph_osd *o = con->private;
1478 struct ceph_osd_client *osdc = o->o_osdc;
1479 struct ceph_auth_client *ac = osdc->client->monc.auth;
1480 int ret = 0;
1481
1482 if (force_new && o->o_authorizer) {
1483 ac->ops->destroy_authorizer(ac, o->o_authorizer);
1484 o->o_authorizer = NULL;
1485 }
1486 if (o->o_authorizer == NULL) {
1487 ret = ac->ops->create_authorizer(
1488 ac, CEPH_ENTITY_TYPE_OSD,
1489 &o->o_authorizer,
1490 &o->o_authorizer_buf,
1491 &o->o_authorizer_buf_len,
1492 &o->o_authorizer_reply_buf,
1493 &o->o_authorizer_reply_buf_len);
1494 if (ret)
1495 return ret;
1496 }
1497
1498 *proto = ac->protocol;
1499 *buf = o->o_authorizer_buf;
1500 *len = o->o_authorizer_buf_len;
1501 *reply_buf = o->o_authorizer_reply_buf;
1502 *reply_len = o->o_authorizer_reply_buf_len;
1503 return 0;
1504}
1505
1506
1507static int verify_authorizer_reply(struct ceph_connection *con, int len)
1508{
1509 struct ceph_osd *o = con->private;
1510 struct ceph_osd_client *osdc = o->o_osdc;
1511 struct ceph_auth_client *ac = osdc->client->monc.auth;
1512
1513 return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
1514}
1515
1516static int invalidate_authorizer(struct ceph_connection *con)
1517{
1518 struct ceph_osd *o = con->private;
1519 struct ceph_osd_client *osdc = o->o_osdc;
1520 struct ceph_auth_client *ac = osdc->client->monc.auth;
1521
1522 if (ac->ops->invalidate_authorizer)
1523 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
1524
1525 return ceph_monc_validate_auth(&osdc->client->monc);
1526}
1527
1528static const struct ceph_connection_operations osd_con_ops = {
1529 .get = get_osd_con,
1530 .put = put_osd_con,
1531 .dispatch = dispatch,
1532 .get_authorizer = get_authorizer,
1533 .verify_authorizer_reply = verify_authorizer_reply,
1534 .invalidate_authorizer = invalidate_authorizer,
1535 .alloc_msg = alloc_msg,
1536 .fault = osd_reset,
1537};
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
new file mode 100644
index 000000000000..1b1a3ca43afc
--- /dev/null
+++ b/fs/ceph/osd_client.h
@@ -0,0 +1,166 @@
1#ifndef _FS_CEPH_OSD_CLIENT_H
2#define _FS_CEPH_OSD_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/mempool.h>
7#include <linux/rbtree.h>
8
9#include "types.h"
10#include "osdmap.h"
11#include "messenger.h"
12
13struct ceph_msg;
14struct ceph_snap_context;
15struct ceph_osd_request;
16struct ceph_osd_client;
17struct ceph_authorizer;
18
19/*
20 * completion callback for async writepages
21 */
22typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
23 struct ceph_msg *);
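/*
 * r_callback is invoked from handle_reply() when a reply arrives;
 * r_safe_callback fires only for replies with CEPH_OSD_FLAG_ONDISK set.
 */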
24
25/* a given osd we're communicating with */
26struct ceph_osd {
27 atomic_t o_ref;
28 struct ceph_osd_client *o_osdc;
29 int o_osd;
30 int o_incarnation;
31 struct rb_node o_node;
32 struct ceph_connection o_con;
33 struct list_head o_requests;
34 struct list_head o_osd_lru;
35 struct ceph_authorizer *o_authorizer;
36 void *o_authorizer_buf, *o_authorizer_reply_buf;
37 size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
38 unsigned long lru_ttl;
39 int o_marked_for_keepalive;
40 struct list_head o_keepalive_item;
41};
42
43/* an in-flight request */
44struct ceph_osd_request {
45 u64 r_tid; /* unique for this client */
46 struct rb_node r_node;
47 struct list_head r_req_lru_item;
48 struct list_head r_osd_item;
49 struct ceph_osd *r_osd;
50 struct ceph_pg r_pgid;
51
52 struct ceph_connection *r_con_filling_msg;
53
54 struct ceph_msg *r_request, *r_reply;
55 int r_result;
56 int r_flags; /* any additional flags for the osd */
57 u32 r_sent; /* >0 if r_request is sending/sent */
58 int r_got_reply;
59
60 struct ceph_osd_client *r_osdc;
61 struct kref r_kref;
62 bool r_mempool;
63 struct completion r_completion, r_safe_completion;
64 ceph_osdc_callback_t r_callback, r_safe_callback;
65 struct ceph_eversion r_reassert_version;
66 struct list_head r_unsafe_item;
67
68 struct inode *r_inode; /* for use by callbacks */
69 struct writeback_control *r_wbc; /* ditto */
70
71 char r_oid[40]; /* object name */
72 int r_oid_len;
73 unsigned long r_sent_stamp;
74 bool r_resend; /* msg send failed, needs retry */
75
76 struct ceph_file_layout r_file_layout;
77 struct ceph_snap_context *r_snapc; /* snap context for writes */
78 unsigned r_num_pages; /* size of page array (follows) */
79 struct page **r_pages; /* pages for data payload */
80 int r_pages_from_pool;
81 int r_own_pages; /* if true, i own page list */
82};
83
84struct ceph_osd_client {
85 struct ceph_client *client;
86
87 struct ceph_osdmap *osdmap; /* current map */
88 struct rw_semaphore map_sem;
89 struct completion map_waiters;
90 u64 last_requested_map;
91
92 struct mutex request_mutex;
93 struct rb_root osds; /* osds */
94 struct list_head osd_lru; /* idle osds */
95 u64 timeout_tid; /* tid of timeout triggering rq */
96 u64 last_tid; /* tid of last request */
97 struct rb_root requests; /* pending requests */
98 struct list_head req_lru; /* pending requests lru */
99 int num_requests;
100 struct delayed_work timeout_work;
101 struct delayed_work osds_timeout_work;
102#ifdef CONFIG_DEBUG_FS
103 struct dentry *debugfs_file;
104#endif
105
106 mempool_t *req_mempool;
107
108 struct ceph_msgpool msgpool_op;
109 struct ceph_msgpool msgpool_op_reply;
110};
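/*
 * Lock ordering: map_sem is taken outside request_mutex (see
 * ceph_osdc_start_request() and handle_timeout()).
 */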
111
112extern int ceph_osdc_init(struct ceph_osd_client *osdc,
113 struct ceph_client *client);
114extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
115
116extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
117 struct ceph_msg *msg);
118extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
119 struct ceph_msg *msg);
120
121extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
122 struct ceph_file_layout *layout,
123 struct ceph_vino vino,
124 u64 offset, u64 *len, int op, int flags,
125 struct ceph_snap_context *snapc,
126 int do_sync, u32 truncate_seq,
127 u64 truncate_size,
128 struct timespec *mtime,
129 bool use_mempool, int num_reply);
130
131static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
132{
133 kref_get(&req->r_kref);
134}
135extern void ceph_osdc_release_request(struct kref *kref);
136static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
137{
138 kref_put(&req->r_kref, ceph_osdc_release_request);
139}
140
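/*
 * Illustrative synchronous usage (ceph_osdc_readpages() below is a
 * concrete example of this pattern):
 *
 *	req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
 *				    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
 *				    NULL, 0, truncate_seq, truncate_size,
 *				    NULL, false, 1);
 *	if (IS_ERR(req))
 *		return PTR_ERR(req);
 *	rc = ceph_osdc_start_request(osdc, req, false);
 *	if (!rc)
 *		rc = ceph_osdc_wait_request(osdc, req);
 *	ceph_osdc_put_request(req);
 */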
141extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
142 struct ceph_osd_request *req,
143 bool nofail);
144extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
145 struct ceph_osd_request *req);
146extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
147
148extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
149 struct ceph_vino vino,
150 struct ceph_file_layout *layout,
151 u64 off, u64 *plen,
152 u32 truncate_seq, u64 truncate_size,
153 struct page **pages, int nr_pages);
154
155extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
156 struct ceph_vino vino,
157 struct ceph_file_layout *layout,
158 struct ceph_snap_context *sc,
159 u64 off, u64 len,
160 u32 truncate_seq, u64 truncate_size,
161 struct timespec *mtime,
162 struct page **pages, int nr_pages,
163 int flags, int do_sync, bool nofail);
164
165#endif
166
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
new file mode 100644
index 000000000000..b83f2692b835
--- /dev/null
+++ b/fs/ceph/osdmap.c
@@ -0,0 +1,1019 @@
1
2#include <asm/div64.h>
3
4#include "super.h"
5#include "osdmap.h"
6#include "crush/hash.h"
7#include "crush/mapper.h"
8#include "decode.h"
9#include "ceph_debug.h"
10
11char *ceph_osdmap_state_str(char *str, int len, int state)
12{
13 int flag = 0;
14
15 if (!len)
16 goto done;
17
18 *str = '\0';
19 if (state) {
20 if (state & CEPH_OSD_EXISTS) {
21 snprintf(str, len, "exists");
22 flag = 1;
23 }
24 if (state & CEPH_OSD_UP) {
25 snprintf(str + strlen(str), len - strlen(str),
26 "%s%s", (flag ? ", " : ""), "up");
27 flag = 1;
28 }
29 } else {
30 snprintf(str, len, "doesn't exist");
31 }
32done:
33 return str;
34}
35
36/* maps */
37
38static int calc_bits_of(unsigned t)
39{
40 int b = 0;
41 while (t) {
42 t = t >> 1;
43 b++;
44 }
45 return b;
46}
47
48/*
49 * the foo_mask is the smallest value 2^n-1 that is >= foo-1.
50 */
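/*
 * e.g. (illustrative): pg_num = 12 gives calc_bits_of(11) = 4, so
 * pg_num_mask = (1 << 4) - 1 = 15.
 */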
51static void calc_pg_masks(struct ceph_pg_pool_info *pi)
52{
53 pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
54 pi->pgp_num_mask =
55 (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
56 pi->lpg_num_mask =
57 (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
58 pi->lpgp_num_mask =
59 (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
60}
61
62/*
63 * decode crush map
64 */
65static int crush_decode_uniform_bucket(void **p, void *end,
66 struct crush_bucket_uniform *b)
67{
68 dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
69 ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
70 b->item_weight = ceph_decode_32(p);
71 return 0;
72bad:
73 return -EINVAL;
74}
75
76static int crush_decode_list_bucket(void **p, void *end,
77 struct crush_bucket_list *b)
78{
79 int j;
80 dout("crush_decode_list_bucket %p to %p\n", *p, end);
81 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
82 if (b->item_weights == NULL)
83 return -ENOMEM;
84 b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
85 if (b->sum_weights == NULL)
86 return -ENOMEM;
87 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
88 for (j = 0; j < b->h.size; j++) {
89 b->item_weights[j] = ceph_decode_32(p);
90 b->sum_weights[j] = ceph_decode_32(p);
91 }
92 return 0;
93bad:
94 return -EINVAL;
95}
96
97static int crush_decode_tree_bucket(void **p, void *end,
98 struct crush_bucket_tree *b)
99{
100 int j;
101 dout("crush_decode_tree_bucket %p to %p\n", *p, end);
102 ceph_decode_32_safe(p, end, b->num_nodes, bad);
103 b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
104 if (b->node_weights == NULL)
105 return -ENOMEM;
106 ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
107 for (j = 0; j < b->num_nodes; j++)
108 b->node_weights[j] = ceph_decode_32(p);
109 return 0;
110bad:
111 return -EINVAL;
112}
113
114static int crush_decode_straw_bucket(void **p, void *end,
115 struct crush_bucket_straw *b)
116{
117 int j;
118 dout("crush_decode_straw_bucket %p to %p\n", *p, end);
119 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
120 if (b->item_weights == NULL)
121 return -ENOMEM;
122 b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
123 if (b->straws == NULL)
124 return -ENOMEM;
125 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
126 for (j = 0; j < b->h.size; j++) {
127 b->item_weights[j] = ceph_decode_32(p);
128 b->straws[j] = ceph_decode_32(p);
129 }
130 return 0;
131bad:
132 return -EINVAL;
133}
134
135static struct crush_map *crush_decode(void *pbyval, void *end)
136{
137 struct crush_map *c;
138 int err = -EINVAL;
139 int i, j;
140 void **p = &pbyval;
141 void *start = pbyval;
142 u32 magic;
143
144 dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
145
146 c = kzalloc(sizeof(*c), GFP_NOFS);
147 if (c == NULL)
148 return ERR_PTR(-ENOMEM);
149
150 ceph_decode_need(p, end, 4*sizeof(u32), bad);
151 magic = ceph_decode_32(p);
152 if (magic != CRUSH_MAGIC) {
153 pr_err("crush_decode magic %x != current %x\n",
154 (unsigned)magic, (unsigned)CRUSH_MAGIC);
155 goto bad;
156 }
157 c->max_buckets = ceph_decode_32(p);
158 c->max_rules = ceph_decode_32(p);
159 c->max_devices = ceph_decode_32(p);
160
161 c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
162 if (c->device_parents == NULL)
163 goto badmem;
164 c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
165 if (c->bucket_parents == NULL)
166 goto badmem;
167
168 c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
169 if (c->buckets == NULL)
170 goto badmem;
171 c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
172 if (c->rules == NULL)
173 goto badmem;
174
175 /* buckets */
176 for (i = 0; i < c->max_buckets; i++) {
177 int size = 0;
178 u32 alg;
179 struct crush_bucket *b;
180
181 ceph_decode_32_safe(p, end, alg, bad);
182 if (alg == 0) {
183 c->buckets[i] = NULL;
184 continue;
185 }
186 dout("crush_decode bucket %d off %x %p to %p\n",
187 i, (int)(*p-start), *p, end);
188
189 switch (alg) {
190 case CRUSH_BUCKET_UNIFORM:
191 size = sizeof(struct crush_bucket_uniform);
192 break;
193 case CRUSH_BUCKET_LIST:
194 size = sizeof(struct crush_bucket_list);
195 break;
196 case CRUSH_BUCKET_TREE:
197 size = sizeof(struct crush_bucket_tree);
198 break;
199 case CRUSH_BUCKET_STRAW:
200 size = sizeof(struct crush_bucket_straw);
201 break;
202 default:
203 err = -EINVAL;
204 goto bad;
205 }
206 BUG_ON(size == 0);
207 b = c->buckets[i] = kzalloc(size, GFP_NOFS);
208 if (b == NULL)
209 goto badmem;
210
211 ceph_decode_need(p, end, 4*sizeof(u32), bad);
212 b->id = ceph_decode_32(p);
213 b->type = ceph_decode_16(p);
214 b->alg = ceph_decode_8(p);
215 b->hash = ceph_decode_8(p);
216 b->weight = ceph_decode_32(p);
217 b->size = ceph_decode_32(p);
218
219 dout("crush_decode bucket size %d off %x %p to %p\n",
220 b->size, (int)(*p-start), *p, end);
221
222 b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
223 if (b->items == NULL)
224 goto badmem;
225 b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
226 if (b->perm == NULL)
227 goto badmem;
228 b->perm_n = 0;
229
230 ceph_decode_need(p, end, b->size*sizeof(u32), bad);
231 for (j = 0; j < b->size; j++)
232 b->items[j] = ceph_decode_32(p);
233
234 switch (b->alg) {
235 case CRUSH_BUCKET_UNIFORM:
236 err = crush_decode_uniform_bucket(p, end,
237 (struct crush_bucket_uniform *)b);
238 if (err < 0)
239 goto bad;
240 break;
241 case CRUSH_BUCKET_LIST:
242 err = crush_decode_list_bucket(p, end,
243 (struct crush_bucket_list *)b);
244 if (err < 0)
245 goto bad;
246 break;
247 case CRUSH_BUCKET_TREE:
248 err = crush_decode_tree_bucket(p, end,
249 (struct crush_bucket_tree *)b);
250 if (err < 0)
251 goto bad;
252 break;
253 case CRUSH_BUCKET_STRAW:
254 err = crush_decode_straw_bucket(p, end,
255 (struct crush_bucket_straw *)b);
256 if (err < 0)
257 goto bad;
258 break;
259 }
260 }
261
262 /* rules */
263 dout("rule vec is %p\n", c->rules);
264 for (i = 0; i < c->max_rules; i++) {
265 u32 yes;
266 struct crush_rule *r;
267
268 ceph_decode_32_safe(p, end, yes, bad);
269 if (!yes) {
270 dout("crush_decode NO rule %d off %x %p to %p\n",
271 i, (int)(*p-start), *p, end);
272 c->rules[i] = NULL;
273 continue;
274 }
275
276 dout("crush_decode rule %d off %x %p to %p\n",
277 i, (int)(*p-start), *p, end);
278
279 /* len */
280 ceph_decode_32_safe(p, end, yes, bad);
281#if BITS_PER_LONG == 32
282 err = -EINVAL;
283 if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
284 goto bad;
285#endif
286 r = c->rules[i] = kmalloc(sizeof(*r) +
287 yes*sizeof(struct crush_rule_step),
288 GFP_NOFS);
289 if (r == NULL)
290 goto badmem;
291 dout(" rule %d is at %p\n", i, r);
292 r->len = yes;
293 ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
294 ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
295 for (j = 0; j < r->len; j++) {
296 r->steps[j].op = ceph_decode_32(p);
297 r->steps[j].arg1 = ceph_decode_32(p);
298 r->steps[j].arg2 = ceph_decode_32(p);
299 }
300 }
301
302 /* ignore trailing name maps. */
303
304 dout("crush_decode success\n");
305 return c;
306
307badmem:
308 err = -ENOMEM;
309bad:
310 dout("crush_decode fail %d\n", err);
311 crush_destroy(c);
312 return ERR_PTR(err);
313}
314
315
316/*
317 * osd map
318 */
319void ceph_osdmap_destroy(struct ceph_osdmap *map)
320{
321 dout("osdmap_destroy %p\n", map);
322 if (map->crush)
323 crush_destroy(map->crush);
324 while (!RB_EMPTY_ROOT(&map->pg_temp)) {
325 struct ceph_pg_mapping *pg =
326 rb_entry(rb_first(&map->pg_temp),
327 struct ceph_pg_mapping, node);
328 rb_erase(&pg->node, &map->pg_temp);
329 kfree(pg);
330 }
331 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
332 struct ceph_pg_pool_info *pi =
333 rb_entry(rb_first(&map->pg_pools),
334 struct ceph_pg_pool_info, node);
335 rb_erase(&pi->node, &map->pg_pools);
336 kfree(pi);
337 }
338 kfree(map->osd_state);
339 kfree(map->osd_weight);
340 kfree(map->osd_addr);
341 kfree(map);
342}
343
344/*
345 * adjust max osd value. reallocate arrays.
346 */
347static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
348{
349 u8 *state;
350 struct ceph_entity_addr *addr;
351 u32 *weight;
352
353 state = kcalloc(max, sizeof(*state), GFP_NOFS);
354 addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
355 weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
356 if (state == NULL || addr == NULL || weight == NULL) {
357 kfree(state);
358 kfree(addr);
359 kfree(weight);
360 return -ENOMEM;
361 }
362
363 /* copy old? */
364 if (map->osd_state) {
365 memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
366 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
367 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
368 kfree(map->osd_state);
369 kfree(map->osd_addr);
370 kfree(map->osd_weight);
371 }
372
373 map->osd_state = state;
374 map->osd_weight = weight;
375 map->osd_addr = addr;
376 map->max_osd = max;
377 return 0;
378}
379
380/*
381 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
382 * to a set of osds)
383 */
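/*
 * pgid_cmp() below compares raw encodings; this relies on struct
 * ceph_pg being exactly 64 bits, giving a stable (if arbitrary) order.
 */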
384static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
385{
386 u64 a = *(u64 *)&l;
387 u64 b = *(u64 *)&r;
388
389 if (a < b)
390 return -1;
391 if (a > b)
392 return 1;
393 return 0;
394}
395
396static int __insert_pg_mapping(struct ceph_pg_mapping *new,
397 struct rb_root *root)
398{
399 struct rb_node **p = &root->rb_node;
400 struct rb_node *parent = NULL;
401 struct ceph_pg_mapping *pg = NULL;
402 int c;
403
404 while (*p) {
405 parent = *p;
406 pg = rb_entry(parent, struct ceph_pg_mapping, node);
407 c = pgid_cmp(new->pgid, pg->pgid);
408 if (c < 0)
409 p = &(*p)->rb_left;
410 else if (c > 0)
411 p = &(*p)->rb_right;
412 else
413 return -EEXIST;
414 }
415
416 rb_link_node(&new->node, parent, p);
417 rb_insert_color(&new->node, root);
418 return 0;
419}
420
421static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
422 struct ceph_pg pgid)
423{
424 struct rb_node *n = root->rb_node;
425 struct ceph_pg_mapping *pg;
426 int c;
427
428 while (n) {
429 pg = rb_entry(n, struct ceph_pg_mapping, node);
430 c = pgid_cmp(pgid, pg->pgid);
431 if (c < 0)
432 n = n->rb_left;
433 else if (c > 0)
434 n = n->rb_right;
435 else
436 return pg;
437 }
438 return NULL;
439}
440
441/*
442 * rbtree of pg pool info
443 */
444static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
445{
446 struct rb_node **p = &root->rb_node;
447 struct rb_node *parent = NULL;
448 struct ceph_pg_pool_info *pi = NULL;
449
450 while (*p) {
451 parent = *p;
452 pi = rb_entry(parent, struct ceph_pg_pool_info, node);
453 if (new->id < pi->id)
454 p = &(*p)->rb_left;
455 else if (new->id > pi->id)
456 p = &(*p)->rb_right;
457 else
458 return -EEXIST;
459 }
460
461 rb_link_node(&new->node, parent, p);
462 rb_insert_color(&new->node, root);
463 return 0;
464}
465
466static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
467{
468 struct ceph_pg_pool_info *pi;
469 struct rb_node *n = root->rb_node;
470
471 while (n) {
472 pi = rb_entry(n, struct ceph_pg_pool_info, node);
473 if (id < pi->id)
474 n = n->rb_left;
475 else if (id > pi->id)
476 n = n->rb_right;
477 else
478 return pi;
479 }
480 return NULL;
481}
482
483/*
484 * decode a full map.
485 */
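/*
 * Encoding order, as consumed below: version, fsid, epoch, created,
 * modified, pg pools, pool_max, flags, max_osd, osd states, weights
 * and addrs, pg_temp mappings, and finally the crush map.
 */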
486struct ceph_osdmap *osdmap_decode(void **p, void *end)
487{
488 struct ceph_osdmap *map;
489 u16 version;
490 u32 len, max, i;
491 u8 ev;
492 int err = -EINVAL;
493 void *start = *p;
494 struct ceph_pg_pool_info *pi;
495
496 dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
497
498 map = kzalloc(sizeof(*map), GFP_NOFS);
499 if (map == NULL)
500 return ERR_PTR(-ENOMEM);
501 map->pg_temp = RB_ROOT;
502
503 ceph_decode_16_safe(p, end, version, bad);
504 if (version > CEPH_OSDMAP_VERSION) {
505 pr_warning("got unknown v %d > %d of osdmap\n", version,
506 CEPH_OSDMAP_VERSION);
507 goto bad;
508 }
509
510 ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
511 ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
512 map->epoch = ceph_decode_32(p);
513 ceph_decode_copy(p, &map->created, sizeof(map->created));
514 ceph_decode_copy(p, &map->modified, sizeof(map->modified));
515
516 ceph_decode_32_safe(p, end, max, bad);
517 while (max--) {
518 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
519 pi = kmalloc(sizeof(*pi), GFP_NOFS);
520 if (!pi)
521 goto bad;
522 pi->id = ceph_decode_32(p);
523 ev = ceph_decode_8(p); /* encoding version */
524 if (ev > CEPH_PG_POOL_VERSION) {
525 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
526 ev, CEPH_PG_POOL_VERSION);
527 goto bad;
528 }
529 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
530 __insert_pg_pool(&map->pg_pools, pi);
531 calc_pg_masks(pi);
532 *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
533 *p += le32_to_cpu(pi->v.num_removed_snap_intervals)
534 * sizeof(u64) * 2;
535 }
536 ceph_decode_32_safe(p, end, map->pool_max, bad);
537
538 ceph_decode_32_safe(p, end, map->flags, bad);
539
540 max = ceph_decode_32(p);
541
542 /* (re)alloc osd arrays */
543 err = osdmap_set_max_osd(map, max);
544 if (err < 0)
545 goto bad;
546 dout("osdmap_decode max_osd = %d\n", map->max_osd);
547
548 /* osds */
549 err = -EINVAL;
550 ceph_decode_need(p, end, 3*sizeof(u32) +
551 map->max_osd*(1 + sizeof(*map->osd_weight) +
552 sizeof(*map->osd_addr)), bad);
553 *p += 4; /* skip length field (should match max) */
554 ceph_decode_copy(p, map->osd_state, map->max_osd);
555
556 *p += 4; /* skip length field (should match max) */
557 for (i = 0; i < map->max_osd; i++)
558 map->osd_weight[i] = ceph_decode_32(p);
559
560 *p += 4; /* skip length field (should match max) */
561 ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
562 for (i = 0; i < map->max_osd; i++)
563 ceph_decode_addr(&map->osd_addr[i]);
564
565 /* pg_temp */
566 ceph_decode_32_safe(p, end, len, bad);
567 for (i = 0; i < len; i++) {
568 int n, j;
569 struct ceph_pg pgid;
570 struct ceph_pg_mapping *pg;
571
572 ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
573 ceph_decode_copy(p, &pgid, sizeof(pgid));
574 n = ceph_decode_32(p);
575 ceph_decode_need(p, end, n * sizeof(u32), bad);
576 err = -ENOMEM;
577 pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
578 if (!pg)
579 goto bad;
580 pg->pgid = pgid;
581 pg->len = n;
582 for (j = 0; j < n; j++)
583 pg->osds[j] = ceph_decode_32(p);
584
585 err = __insert_pg_mapping(pg, &map->pg_temp);
586 if (err)
587 goto bad;
588 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len);
589 }
590
591 /* crush */
592 ceph_decode_32_safe(p, end, len, bad);
593 dout("osdmap_decode crush len %d from off 0x%x\n", len,
594 (int)(*p - start));
595 ceph_decode_need(p, end, len, bad);
596 map->crush = crush_decode(*p, end);
597 *p += len;
598 if (IS_ERR(map->crush)) {
599 err = PTR_ERR(map->crush);
600 map->crush = NULL;
601 goto bad;
602 }
603
604 /* ignore the rest of the map */
605 *p = end;
606
607 dout("osdmap_decode done %p %p\n", *p, end);
608 return map;
609
610bad:
611 dout("osdmap_decode fail\n");
612 ceph_osdmap_destroy(map);
613 return ERR_PTR(err);
614}
615
616/*
617 * decode and apply an incremental map update.
618 */
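/*
 * Encoding order, as consumed below: version, fsid, epoch, modified,
 * new pool_max and flags, an optional embedded full map, an optional
 * new crush map, new max_osd, then pool updates and removals, and osd
 * up/down/weight deltas.
 */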
619struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
620 struct ceph_osdmap *map,
621 struct ceph_messenger *msgr)
622{
623 struct crush_map *newcrush = NULL;
624 struct ceph_fsid fsid;
625 u32 epoch = 0;
626 struct ceph_timespec modified;
627 u32 len, pool;
628 __s32 new_pool_max, new_flags, max;
629 void *start = *p;
630 int err = -EINVAL;
631 u16 version;
632 struct rb_node *rbp;
633
634 ceph_decode_16_safe(p, end, version, bad);
635 if (version > CEPH_OSDMAP_INC_VERSION) {
636 pr_warning("got unknown v %d > %d of inc osdmap\n", version,
637 CEPH_OSDMAP_INC_VERSION);
638 goto bad;
639 }
640
641 ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
642 bad);
643 ceph_decode_copy(p, &fsid, sizeof(fsid));
644 epoch = ceph_decode_32(p);
645 BUG_ON(epoch != map->epoch+1);
646 ceph_decode_copy(p, &modified, sizeof(modified));
647 new_pool_max = ceph_decode_32(p);
648 new_flags = ceph_decode_32(p);
649
650 /* full map? */
651 ceph_decode_32_safe(p, end, len, bad);
652 if (len > 0) {
653 dout("apply_incremental full map len %d, %p to %p\n",
654 len, *p, end);
655 return osdmap_decode(p, min(*p+len, end));
656 }
657
658 /* new crush? */
659 ceph_decode_32_safe(p, end, len, bad);
660 if (len > 0) {
661 dout("apply_incremental new crush map len %d, %p to %p\n",
662 len, *p, end);
663 newcrush = crush_decode(*p, min(*p+len, end));
664 if (IS_ERR(newcrush))
665 return ERR_CAST(newcrush);
666 }
667
668 /* new flags? */
669 if (new_flags >= 0)
670 map->flags = new_flags;
671 if (new_pool_max >= 0)
672 map->pool_max = new_pool_max;
673
674 ceph_decode_need(p, end, 5*sizeof(u32), bad);
675
676 /* new max? */
677 max = ceph_decode_32(p);
678 if (max >= 0) {
679 err = osdmap_set_max_osd(map, max);
680 if (err < 0)
681 goto bad;
682 }
683
684 map->epoch++;
685 map->modified = modified;
686 if (newcrush) {
687 if (map->crush)
688 crush_destroy(map->crush);
689 map->crush = newcrush;
690 newcrush = NULL;
691 }
692
693 /* new_pool */
694 ceph_decode_32_safe(p, end, len, bad);
695 while (len--) {
696 __u8 ev;
697 struct ceph_pg_pool_info *pi;
698
699 ceph_decode_32_safe(p, end, pool, bad);
700 ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
701 ev = ceph_decode_8(p); /* encoding version */
702 if (ev > CEPH_PG_POOL_VERSION) {
703 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
704 ev, CEPH_PG_POOL_VERSION);
705 goto bad;
706 }
707 pi = __lookup_pg_pool(&map->pg_pools, pool);
708 if (!pi) {
709 pi = kmalloc(sizeof(*pi), GFP_NOFS);
710 if (!pi) {
711 err = -ENOMEM;
712 goto bad;
713 }
714 pi->id = pool;
715 __insert_pg_pool(&map->pg_pools, pi);
716 }
717 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
718 calc_pg_masks(pi);
719 }
720
721 /* old_pool */
722 ceph_decode_32_safe(p, end, len, bad);
723 while (len--) {
724 struct ceph_pg_pool_info *pi;
725
726 ceph_decode_32_safe(p, end, pool, bad);
727 pi = __lookup_pg_pool(&map->pg_pools, pool);
728 if (pi) {
729 rb_erase(&pi->node, &map->pg_pools);
730 kfree(pi);
731 }
732 }
733
734 /* new_up */
735 err = -EINVAL;
736 ceph_decode_32_safe(p, end, len, bad);
737 while (len--) {
738 u32 osd;
739 struct ceph_entity_addr addr;
740 ceph_decode_32_safe(p, end, osd, bad);
741 ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
742 ceph_decode_addr(&addr);
743 pr_info("osd%d up\n", osd);
744 BUG_ON(osd >= map->max_osd);
745 map->osd_state[osd] |= CEPH_OSD_UP;
746 map->osd_addr[osd] = addr;
747 }
748
749 /* new_down */
750 ceph_decode_32_safe(p, end, len, bad);
751 while (len--) {
752 u32 osd;
753 ceph_decode_32_safe(p, end, osd, bad);
754 (*p)++; /* clean flag */
755 pr_info("osd%d down\n", osd);
756 if (osd < map->max_osd)
757 map->osd_state[osd] &= ~CEPH_OSD_UP;
758 }
759
760 /* new_weight */
761 ceph_decode_32_safe(p, end, len, bad);
762 while (len--) {
763 u32 osd, off;
764 ceph_decode_need(p, end, sizeof(u32)*2, bad);
765 osd = ceph_decode_32(p);
766 off = ceph_decode_32(p);
767 pr_info("osd%d weight 0x%x %s\n", osd, off,
768 off == CEPH_OSD_IN ? "(in)" :
769 (off == CEPH_OSD_OUT ? "(out)" : ""));
770 if (osd < map->max_osd)
771 map->osd_weight[osd] = off;
772 }
773
774 /* new_pg_temp */
775 rbp = rb_first(&map->pg_temp);
776 ceph_decode_32_safe(p, end, len, bad);
777 while (len--) {
778 struct ceph_pg_mapping *pg;
779 int j;
780 struct ceph_pg pgid;
781 u32 pglen;
782 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
783 ceph_decode_copy(p, &pgid, sizeof(pgid));
784 pglen = ceph_decode_32(p);
785
786 /* remove any? */
787 while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
788 node)->pgid, pgid) <= 0) {
789 struct rb_node *cur = rbp;
790 rbp = rb_next(rbp);
791 dout(" removed pg_temp %llx\n",
792 *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
793 node)->pgid);
794 rb_erase(cur, &map->pg_temp);
795 }
796
797 if (pglen) {
798 /* insert */
799 ceph_decode_need(p, end, pglen*sizeof(u32), bad);
800 pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
801 if (!pg) {
802 err = -ENOMEM;
803 goto bad;
804 }
805 pg->pgid = pgid;
806 pg->len = pglen;
807 for (j = 0; j < pglen; j++)
808 pg->osds[j] = ceph_decode_32(p);
809 err = __insert_pg_mapping(pg, &map->pg_temp);
810 if (err)
811 goto bad;
812 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
813 pglen);
814 }
815 }
816 while (rbp) {
817 struct rb_node *cur = rbp;
818 rbp = rb_next(rbp);
819 dout(" removed pg_temp %llx\n",
820 *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
821 node)->pgid);
822 rb_erase(cur, &map->pg_temp);
823 }
824
825 /* ignore the rest */
826 *p = end;
827 return map;
828
829bad:
830 pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
831 epoch, (int)(*p - start), *p, start, end);
832 print_hex_dump(KERN_DEBUG, "osdmap: ",
833 DUMP_PREFIX_OFFSET, 16, 1,
834 start, end - start, true);
835 if (newcrush)
836 crush_destroy(newcrush);
837 return ERR_PTR(err);
838}
839
840
841
842
843/*
844 * calculate file layout from given offset, length.
845 * fill in correct oid, logical length, and object extent
846 * offset, length.
847 *
848 * for now, we write only a single su, until we can
849 * pass a stride back to the caller.
850 */
851void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
852 u64 off, u64 *plen,
853 u64 *ono,
854 u64 *oxoff, u64 *oxlen)
855{
856 u32 osize = le32_to_cpu(layout->fl_object_size);
857 u32 su = le32_to_cpu(layout->fl_stripe_unit);
858 u32 sc = le32_to_cpu(layout->fl_stripe_count);
859 u32 bl, stripeno, stripepos, objsetno;
860 u32 su_per_object;
861 u64 t, su_offset;
862
863 dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen,
864 osize, su);
865 su_per_object = osize / su;
866 dout("osize %u / su %u = su_per_object %u\n", osize, su,
867 su_per_object);
868
869 BUG_ON((su & ~PAGE_MASK) != 0);
870 /* bl = off / su; */
871 t = off;
872 do_div(t, su);
873 bl = t;
874 dout("off %llu / su %u = bl %u\n", off, su, bl);
875
876 stripeno = bl / sc;
877 stripepos = bl % sc;
878 objsetno = stripeno / su_per_object;
879
880 *ono = objsetno * sc + stripepos;
881 dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
882
883 /* oxoff = off % layout->fl_stripe_unit; offset in su */
884 t = off;
885 su_offset = do_div(t, su);
886 *oxoff = su_offset + (stripeno % su_per_object) * su;
887
888 /*
889 * Calculate the length of the extent being written to the selected
890 * object. This is the minimum of the full length requested (plen) or
891 * the remainder of the current stripe being written to.
892 */
893 *oxlen = min_t(u64, *plen, su - su_offset);
894 *plen = *oxlen;
895
896 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
897}
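/*
 * Worked example (values chosen for illustration): with object_size
 * 4 MB, stripe_unit 1 MB and stripe_count 3, a write of 2 MB at file
 * offset 5 MB maps as follows:
 *
 *	su_per_object = 4 MB / 1 MB = 4
 *	bl = 5 MB / 1 MB = 5, stripeno = 5 / 3 = 1, stripepos = 5 % 3 = 2
 *	objsetno = 1 / 4 = 0, so *ono = 0 * 3 + 2 = 2
 *	su_offset = 5 MB % 1 MB = 0, *oxoff = 0 + (1 % 4) * 1 MB = 1 MB
 *	*oxlen = min(2 MB, 1 MB - 0) = 1 MB
 *
 * i.e. only the first stripe unit (in object 2 at offset 1 MB) is
 * mapped; *plen is trimmed to 1 MB and the caller must loop for the
 * remainder.
 */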
898
899/*
900 * calculate an object layout (i.e. pgid) from an oid,
901 * file_layout, and osdmap
902 */
903int ceph_calc_object_layout(struct ceph_object_layout *ol,
904 const char *oid,
905 struct ceph_file_layout *fl,
906 struct ceph_osdmap *osdmap)
907{
908 unsigned num, num_mask;
909 struct ceph_pg pgid;
910 s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
911 int poolid = le32_to_cpu(fl->fl_pg_pool);
912 struct ceph_pg_pool_info *pool;
913 unsigned ps;
914
915 BUG_ON(!osdmap);
916
917 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
918 if (!pool)
919 return -EIO;
920 ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
921 if (preferred >= 0) {
922 ps += preferred;
923 num = le32_to_cpu(pool->v.lpg_num);
924 num_mask = pool->lpg_num_mask;
925 } else {
926 num = le32_to_cpu(pool->v.pg_num);
927 num_mask = pool->pg_num_mask;
928 }
929
930 pgid.ps = cpu_to_le16(ps);
931 pgid.preferred = cpu_to_le16(preferred);
932 pgid.pool = fl->fl_pg_pool;
933 if (preferred >= 0)
934 dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
935 (int)preferred);
936 else
937 dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
938
939 ol->ol_pgid = pgid;
940 ol->ol_stripe_unit = fl->fl_object_stripe_unit;
941 return 0;
942}
943
944/*
945 * Calculate raw osd vector for the given pgid. Return pointer to osd
946 * array, or NULL on failure.
947 */
948static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
949 int *osds, int *num)
950{
951 struct ceph_pg_mapping *pg;
952 struct ceph_pg_pool_info *pool;
953 int ruleno;
954 unsigned poolid, ps, pps;
955 int preferred;
956
957 /* pg_temp? */
958 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
959 if (pg) {
960 *num = pg->len;
961 return pg->osds;
962 }
963
964 /* crush */
965 poolid = le32_to_cpu(pgid.pool);
966 ps = le16_to_cpu(pgid.ps);
967 preferred = (s16)le16_to_cpu(pgid.preferred);
968
969 /* don't forcefeed bad device ids to crush */
970 if (preferred >= osdmap->max_osd ||
971 preferred >= osdmap->crush->max_devices)
972 preferred = -1;
973
974 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
975 if (!pool)
976 return NULL;
977 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
978 pool->v.type, pool->v.size);
979 if (ruleno < 0) {
980 pr_err("no crush rule pool %d type %d size %d\n",
981 poolid, pool->v.type, pool->v.size);
982 return NULL;
983 }
984
985 if (preferred >= 0)
986 pps = ceph_stable_mod(ps,
987 le32_to_cpu(pool->v.lpgp_num),
988 pool->lpgp_num_mask);
989 else
990 pps = ceph_stable_mod(ps,
991 le32_to_cpu(pool->v.pgp_num),
992 pool->pgp_num_mask);
993 pps += poolid;
994 *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
995 min_t(int, pool->v.size, *num),
996 preferred, osdmap->osd_weight);
997 return osds;
998}
999
1000/*
1001 * Return primary osd for given pgid, or -1 if none.
1002 */
1003int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1004{
1005 int rawosds[10], *osds;
1006 int i, num = ARRAY_SIZE(rawosds);
1007
1008 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1009 if (!osds)
1010 return -1;
1011
1012 /* primary is first up osd */
1013 for (i = 0; i < num; i++)
1014 if (ceph_osd_is_up(osdmap, osds[i]))
1015 return osds[i];
1018 return -1;
1019}
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
new file mode 100644
index 000000000000..1fb55afb2642
--- /dev/null
+++ b/fs/ceph/osdmap.h
@@ -0,0 +1,125 @@
1#ifndef _FS_CEPH_OSDMAP_H
2#define _FS_CEPH_OSDMAP_H
3
4#include <linux/rbtree.h>
5#include "types.h"
6#include "ceph_fs.h"
7#include "crush/crush.h"
8
9/*
10 * The osd map describes the current membership of the osd cluster and
11 * specifies the mapping of objects to placement groups and placement
12 * groups to (sets of) osds. That is, it completely specifies the
13 * (desired) distribution of all data objects in the system at some
14 * point in time.
15 *
16 * Each map version is identified by an epoch, which increases monotonically.
17 *
18 * The map can be updated either via an incremental map (diff) describing
19 * the change between two successive epochs, or as a fully encoded map.
20 */
21struct ceph_pg_pool_info {
22 struct rb_node node;
23 int id;
24 struct ceph_pg_pool v;
25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
26};
27
28struct ceph_pg_mapping {
29 struct rb_node node;
30 struct ceph_pg pgid;
31 int len;
32 int osds[];
33};
34
35struct ceph_osdmap {
36 struct ceph_fsid fsid;
37 u32 epoch;
38 u32 mkfs_epoch;
39 struct ceph_timespec created, modified;
40
41 u32 flags; /* CEPH_OSDMAP_* */
42
43 u32 max_osd; /* size of osd_state, _offload, _addr arrays */
44 u8 *osd_state; /* CEPH_OSD_* */
45 u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
46 struct ceph_entity_addr *osd_addr;
47
48 struct rb_root pg_temp;
49 struct rb_root pg_pools;
50 u32 pool_max;
51
52 /* the CRUSH map specifies the mapping of placement groups to
53 * the list of osds that store+replicate them. */
54 struct crush_map *crush;
55};
56
57/*
58 * file layout helpers
59 */
60#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
61#define ceph_file_layout_stripe_count(l) \
62 ((__s32)le32_to_cpu((l).fl_stripe_count))
63#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
64#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
65#define ceph_file_layout_object_su(l) \
66 ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
67#define ceph_file_layout_pg_preferred(l) \
68 ((__s32)le32_to_cpu((l).fl_pg_preferred))
69#define ceph_file_layout_pg_pool(l) \
70 ((__s32)le32_to_cpu((l).fl_pg_pool))
71
72static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
73{
74 return le32_to_cpu(l->fl_stripe_unit) *
75 le32_to_cpu(l->fl_stripe_count);
76}
77
78/* "period" == bytes before i start on a new set of objects */
79static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
80{
81 return le32_to_cpu(l->fl_object_size) *
82 le32_to_cpu(l->fl_stripe_count);
83}
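/*
 * Example (illustrative numbers): stripe_unit 1 MB and stripe_count 3
 * give a stripe width of 3 MB; with object_size 4 MB the period is
 * 4 MB * 3 = 12 MB, i.e. after 12 MB of file data the layout starts
 * on a fresh set of 3 objects.
 */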
84
85
86static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
87{
88 return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
89}
90
91static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
92{
93 return map && (map->flags & flag);
94}
95
96extern char *ceph_osdmap_state_str(char *str, int len, int state);
97
98static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
99 int osd)
100{
101 if (osd >= map->max_osd)
102 return NULL;
103 return &map->osd_addr[osd];
104}
105
106extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
107extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
108 struct ceph_osdmap *map,
109 struct ceph_messenger *msgr);
110extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
111
112/* calculate mapping of a file extent to an object */
113extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
114 u64 off, u64 *plen,
115 u64 *bno, u64 *oxoff, u64 *oxlen);
116
117/* calculate mapping of object to a placement group */
118extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
119 const char *oid,
120 struct ceph_file_layout *fl,
121 struct ceph_osdmap *osdmap);
122extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
123 struct ceph_pg pgid);
124
125#endif
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
new file mode 100644
index 000000000000..370e93695474
--- /dev/null
+++ b/fs/ceph/pagelist.c
@@ -0,0 +1,54 @@
1
2#include <linux/pagemap.h>
3#include <linux/highmem.h>
4
5#include "pagelist.h"
6
7int ceph_pagelist_release(struct ceph_pagelist *pl)
8{
9 if (pl->mapped_tail)
10 kunmap(pl->mapped_tail);
11 while (!list_empty(&pl->head)) {
12 struct page *page = list_first_entry(&pl->head, struct page,
13 lru);
14 list_del(&page->lru);
15 __free_page(page);
16 }
17 return 0;
18}
19
20static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
21{
22 struct page *page = alloc_page(GFP_NOFS);
23 if (!page)
24 return -ENOMEM;
25 pl->room += PAGE_SIZE;
26 list_add_tail(&page->lru, &pl->head);
27 if (pl->mapped_tail)
28 kunmap(pl->mapped_tail);
29 pl->mapped_tail = kmap(page);
30 return 0;
31}
32
33int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
34{
35 while (pl->room < len) {
36 size_t bit = pl->room;
37 int ret;
38
39 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
40 buf, bit);
41 pl->length += bit;
42 pl->room -= bit;
43 buf += bit;
44 len -= bit;
45 ret = ceph_pagelist_addpage(pl);
46 if (ret)
47 return ret;
48 }
49
50 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
51 pl->length += len;
52 pl->room -= len;
53 return 0;
54}
diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h
new file mode 100644
index 000000000000..e8a4187e1087
--- /dev/null
+++ b/fs/ceph/pagelist.h
@@ -0,0 +1,54 @@
1#ifndef __FS_CEPH_PAGELIST_H
2#define __FS_CEPH_PAGELIST_H
3
4#include <linux/list.h>
5
6struct ceph_pagelist {
7 struct list_head head;
8 void *mapped_tail;
9 size_t length;
10 size_t room;
11};
12
13static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
14{
15 INIT_LIST_HEAD(&pl->head);
16 pl->mapped_tail = NULL;
17 pl->length = 0;
18 pl->room = 0;
19}
20extern int ceph_pagelist_release(struct ceph_pagelist *pl);
21
22extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l);
23
24static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
25{
26 __le64 ev = cpu_to_le64(v);
27 return ceph_pagelist_append(pl, &ev, sizeof(ev));
28}
29static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
30{
31 __le32 ev = cpu_to_le32(v);
32 return ceph_pagelist_append(pl, &ev, sizeof(ev));
33}
34static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
35{
36 __le16 ev = cpu_to_le16(v);
37 return ceph_pagelist_append(pl, &ev, sizeof(ev));
38}
39static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
40{
41 return ceph_pagelist_append(pl, &v, 1);
42}
43static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
44 char *s, size_t len)
45{
46 int ret = ceph_pagelist_encode_32(pl, len);
47 if (ret)
48 return ret;
49 if (len)
50 return ceph_pagelist_append(pl, s, len);
51 return 0;
52}
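/*
 * Minimal usage sketch (illustrative only; error handling and the
 * hookup to an outgoing message are elided):
 *
 *	struct ceph_pagelist pl;
 *
 *	ceph_pagelist_init(&pl);
 *	ceph_pagelist_encode_32(&pl, 1);	.. encoding version
 *	ceph_pagelist_encode_string(&pl, name, strlen(name));
 *	... hand pl.head/pl.length to an outgoing message ...
 *	ceph_pagelist_release(&pl);
 */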
53
54#endif
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
new file mode 100644
index 000000000000..26ac8b89a676
--- /dev/null
+++ b/fs/ceph/rados.h
@@ -0,0 +1,374 @@
1#ifndef __RADOS_H
2#define __RADOS_H
3
4/*
5 * Data types for the Ceph distributed object storage layer RADOS
6 * (Reliable Autonomic Distributed Object Store).
7 */
8
9#include "msgr.h"
10
11/*
12 * osdmap encoding versions
13 */
14#define CEPH_OSDMAP_INC_VERSION 4
15#define CEPH_OSDMAP_VERSION 4
16
17/*
18 * fs id
19 */
20struct ceph_fsid {
21 unsigned char fsid[16];
22};
23
24static inline int ceph_fsid_compare(const struct ceph_fsid *a,
25 const struct ceph_fsid *b)
26{
27 return memcmp(a, b, sizeof(*a));
28}
29
30/*
31 * ino, object, etc.
32 */
33typedef __le64 ceph_snapid_t;
34#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
35#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
36#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
37
38struct ceph_timespec {
39 __le32 tv_sec;
40 __le32 tv_nsec;
41} __attribute__ ((packed));
42
43
44/*
45 * object layout - how objects are mapped into PGs
46 */
47#define CEPH_OBJECT_LAYOUT_HASH 1
48#define CEPH_OBJECT_LAYOUT_LINEAR 2
49#define CEPH_OBJECT_LAYOUT_HASHINO 3
50
51/*
52 * pg layout -- how PGs are mapped onto (sets of) OSDs
53 */
54#define CEPH_PG_LAYOUT_CRUSH 0
55#define CEPH_PG_LAYOUT_HASH 1
56#define CEPH_PG_LAYOUT_LINEAR 2
57#define CEPH_PG_LAYOUT_HYBRID 3
58
59
60/*
61 * placement group.
62 * we encode this into one __le64.
63 */
64struct ceph_pg {
65 __le16 preferred; /* preferred primary osd */
66 __le16 ps; /* placement seed */
67 __le32 pool; /* object pool */
68} __attribute__ ((packed));
69
70/*
71 * pg_pool is a set of pgs storing a pool of objects
72 *
73 * pg_num -- base number of pseudorandomly placed pgs
74 *
75 * pgp_num -- effective number when calculating pg placement. this
76 * is used when pg_num increases: growing pg_num "splits" data into
77 * new pgs. for this to proceed smoothly, new pgs are initially
78 * colocated with their parents; that is, pgp_num doesn't increase
79 * until the new pgs have successfully split. only _then_ are the new
80 * pgs placed independently.
81 *
82 * lpg_num -- localized pg count (per device). replicas are randomly
83 * selected.
84 *
85 * lpgp_num -- as above.
86 */
87#define CEPH_PG_TYPE_REP 1
88#define CEPH_PG_TYPE_RAID4 2
89#define CEPH_PG_POOL_VERSION 2
90struct ceph_pg_pool {
91 __u8 type; /* CEPH_PG_TYPE_* */
92 __u8 size; /* number of osds in each pg */
93 __u8 crush_ruleset; /* crush placement rule */
94 __u8 object_hash; /* hash mapping object name to ps */
95 __le32 pg_num, pgp_num; /* number of pg's */
96 __le32 lpg_num, lpgp_num; /* number of localized pg's */
97 __le32 last_change; /* most recent epoch changed */
98 __le64 snap_seq; /* seq for per-pool snapshot */
99 __le32 snap_epoch; /* epoch of last snap */
100 __le32 num_snaps;
101 __le32 num_removed_snap_intervals;
102 __le64 uid;
103} __attribute__ ((packed));
104
105/*
106 * stable_mod func is used to control number of placement groups.
107 * similar to straight-up modulo, but produces a stable mapping as b
108 * increases over time. b is the number of bins, and bmask is the
109 * containing power of 2 minus 1.
110 *
111 * b <= bmask and bmask=(2**n)-1
112 * e.g., b=12 -> bmask=15, b=123 -> bmask=127
113 */
114static inline int ceph_stable_mod(int x, int b, int bmask)
115{
116 if ((x & bmask) < b)
117 return x & bmask;
118 else
119 return x & (bmask >> 1);
120}
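/*
 * e.g. b = 12, bmask = 15:
 *	ceph_stable_mod(7, 12, 15)  = 7  (7 & 15 = 7 < 12)
 *	ceph_stable_mod(13, 12, 15) = 5  (13 & 15 = 13 >= 12, 13 & 7 = 5)
 * values that already fit under b keep their bin as b grows toward
 * bmask+1, so raising the pg count only moves data into the _new_ bins.
 */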
121
122/*
123 * object layout - how a given object should be stored.
124 */
125struct ceph_object_layout {
126 struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */
127 __le32 ol_stripe_unit; /* for per-object parity, if any */
128} __attribute__ ((packed));
129
130/*
131 * compound epoch+version, used by storage layer to serialize mutations
132 */
133struct ceph_eversion {
134 __le32 epoch;
135 __le64 version;
136} __attribute__ ((packed));
137
138/*
139 * osd map bits
140 */
141
142/* status bits */
143#define CEPH_OSD_EXISTS 1
144#define CEPH_OSD_UP 2
145
146/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
147#define CEPH_OSD_IN 0x10000
148#define CEPH_OSD_OUT 0
149
150
151/*
152 * osd map flag bits
153 */
154#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
155#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
156#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
157#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
158#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
159
160/*
161 * osd ops
162 */
163#define CEPH_OSD_OP_MODE 0xf000
164#define CEPH_OSD_OP_MODE_RD 0x1000
165#define CEPH_OSD_OP_MODE_WR 0x2000
166#define CEPH_OSD_OP_MODE_RMW 0x3000
167#define CEPH_OSD_OP_MODE_SUB 0x4000
168
169#define CEPH_OSD_OP_TYPE 0x0f00
170#define CEPH_OSD_OP_TYPE_LOCK 0x0100
171#define CEPH_OSD_OP_TYPE_DATA 0x0200
172#define CEPH_OSD_OP_TYPE_ATTR 0x0300
173#define CEPH_OSD_OP_TYPE_EXEC 0x0400
174#define CEPH_OSD_OP_TYPE_PG 0x0500
175
176enum {
177 /** data **/
178 /* read */
179 CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
180 CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
181
182 /* fancy read */
183 CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
184
185 /* write */
186 CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
187 CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
188 CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
189 CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
190 CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
191
192 /* fancy write */
193 CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
194 CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
195 CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
196 CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
197
198 CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
199 CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
200 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
201
202 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
203
204 /** attrs **/
205 /* read */
206 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
207 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
208
209 /* write */
210 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
211 CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
212 CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
213 CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
214
215 /** subop **/
216 CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
217 CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
218 CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
219 CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
220 CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
221
222 /** lock **/
223 CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
224 CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
225 CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
226 CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
227 CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
228 CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
229
230 /** exec **/
231 CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
232
233 /** pg **/
234 CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
235};
236
237static inline int ceph_osd_op_type_lock(int op)
238{
239 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
240}
241static inline int ceph_osd_op_type_data(int op)
242{
243 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
244}
245static inline int ceph_osd_op_type_attr(int op)
246{
247 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
248}
249static inline int ceph_osd_op_type_exec(int op)
250{
251 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
252}
253static inline int ceph_osd_op_type_pg(int op)
254{
255 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
256}
257
258static inline int ceph_osd_op_mode_subop(int op)
259{
260 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
261}
262static inline int ceph_osd_op_mode_read(int op)
263{
264 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
265}
266static inline int ceph_osd_op_mode_modify(int op)
267{
268 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
269}
270
271#define CEPH_OSD_TMAP_HDR 'h'
272#define CEPH_OSD_TMAP_SET 's'
273#define CEPH_OSD_TMAP_RM 'r'
274
275extern const char *ceph_osd_op_name(int op);
276
277
278/*
279 * osd op flags
280 *
281 * An op may be READ, WRITE, or READ|WRITE.
282 */
283enum {
284 CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */
285 CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */
286 CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */
287 CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */
288 CEPH_OSD_FLAG_READ = 16, /* op may read */
289 CEPH_OSD_FLAG_WRITE = 32, /* op may write */
290 CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */
291 CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */
292 CEPH_OSD_FLAG_BALANCE_READS = 256,
293 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
294 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */
295 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */
296};
297
298enum {
299 CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
300};
301
302#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
303#define EBLACKLISTED ESHUTDOWN /* blacklisted */
304
305/*
306 * an individual object operation. each may be accompanied by some data
307 * payload
308 */
309struct ceph_osd_op {
310 __le16 op; /* CEPH_OSD_OP_* */
311 __le32 flags; /* CEPH_OSD_FLAG_* */
312 union {
313 struct {
314 __le64 offset, length;
315 __le64 truncate_size;
316 __le32 truncate_seq;
317 } __attribute__ ((packed)) extent;
318 struct {
319 __le32 name_len;
320 __le32 value_len;
321 } __attribute__ ((packed)) xattr;
322 struct {
323 __u8 class_len;
324 __u8 method_len;
325 __u8 argc;
326 __le32 indata_len;
327 } __attribute__ ((packed)) cls;
328 struct {
329 __le64 cookie, count;
330 } __attribute__ ((packed)) pgls;
331 };
332 __le32 payload_len;
333} __attribute__ ((packed));
334
335/*
336 * osd request message header. each request may include multiple
337 * ceph_osd_op object operations.
338 */
339struct ceph_osd_request_head {
340 __le32 client_inc; /* client incarnation */
341 struct ceph_object_layout layout; /* pgid */
342 __le32 osdmap_epoch; /* client's osdmap epoch */
343
344 __le32 flags;
345
346 struct ceph_timespec mtime; /* for mutations only */
347 struct ceph_eversion reassert_version; /* if we are replaying op */
348
349 __le32 object_len; /* length of object name */
350
351 __le64 snapid; /* snapid to read */
352 __le64 snap_seq; /* writer's snap context */
353 __le32 num_snaps;
354
355 __le16 num_ops;
356 struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */
357} __attribute__ ((packed));
358
359struct ceph_osd_reply_head {
360 __le32 client_inc; /* client incarnation */
361 __le32 flags;
362 struct ceph_object_layout layout;
363 __le32 osdmap_epoch;
364 struct ceph_eversion reassert_version; /* for replaying uncommitted */
365
366 __le32 result; /* result code */
367
368 __le32 object_len; /* length of object name */
369 __le32 num_ops;
370 struct ceph_osd_op ops[]; /* ops[], object */
371} __attribute__ ((packed));
372
373
374#endif
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
new file mode 100644
index 000000000000..bf2a5f3846a4
--- /dev/null
+++ b/fs/ceph/snap.c
@@ -0,0 +1,904 @@
1#include "ceph_debug.h"
2
3#include <linux/sort.h>
4
5#include "super.h"
6#include "decode.h"
7
8/*
9 * Snapshots in ceph are driven in large part by cooperation from the
10 * client. In contrast to local file systems or file servers that
11 * implement snapshots at a single point in the system, ceph's
12 * distributed access to storage requires clients to help decide
13 * whether a write logically occurs before or after a recently created
14 * snapshot.
15 *
16 * This provides a perfect instantaneous client-wide snapshot. Between
17 * clients, however, snapshots may appear to be applied at slightly
18 * different points in time, depending on delays in delivering the
19 * snapshot notification.
20 *
21 * Snapshots are _not_ file system-wide. Instead, each snapshot
22 * applies to the subdirectory nested beneath some directory. This
23 * effectively divides the hierarchy into multiple "realms," where all
24 * of the files contained by each realm share the same set of
25 * snapshots. An individual realm's snap set contains snapshots
26 * explicitly created on that realm, as well as any snaps in its
27 * parent's snap set _after_ the point at which the parent became its
28 * parent (due to, say, a rename). Similarly, snaps from prior parents
29 * are included for the intervals during which they were the parent.
30 *
31 * The client is spared most of this detail, fortunately... it need only
32 * maintain a hierarchy of realms reflecting the current parent/child
33 * realm relationship, and for each realm has an explicit list of snaps
34 * inherited from prior parents.
35 *
36 * A snap_realm struct is maintained for realms containing every inode
37 * with an open cap in the system. (The needed snap realm information is
38 * provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq'
39 * version number is used to ensure that as realm parameters change (new
40 * snapshot, new parent, etc.) the client's realm hierarchy is updated.
41 *
42 * The realm hierarchy drives the generation of a 'snap context' for each
43 * realm, which simply lists the resulting set of snaps for the realm. This
44 * is attached to any writes sent to OSDs.
45 */
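/*
 * Illustrative example (paths invented): snapshotting /home makes
 * /home the root of a realm covering everything beneath it.  If
 * /home/a is later snapshotted, it splits off as a child realm whose
 * snap set is its own snaps plus any /home snaps taken after the
 * split.  If /home/a is then renamed under /data, its realm's
 * prior-parent list preserves the /home snaps that applied while
 * /home was still its parent.
 */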
46/*
47 * Unfortunately error handling is a bit mixed here. If we get a snap
48 * update, but don't have enough memory to update our realm hierarchy,
49 * it's not clear what we can do about it (besides complaining to the
50 * console).
51 */
52
53
54/*
55 * increase ref count for the realm
56 *
57 * caller must hold snap_rwsem for write.
58 */
59void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
60 struct ceph_snap_realm *realm)
61{
62 dout("get_realm %p %d -> %d\n", realm,
63 atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
64 /*
65 * since we _only_ increment realm refs or empty the empty
66 * list with snap_rwsem held, adjusting the empty list here is
67 * safe. we do need to protect against concurrent empty list
68 * additions, however.
69 */
70 if (atomic_read(&realm->nref) == 0) {
71 spin_lock(&mdsc->snap_empty_lock);
72 list_del_init(&realm->empty_item);
73 spin_unlock(&mdsc->snap_empty_lock);
74 }
75
76 atomic_inc(&realm->nref);
77}
78
79static void __insert_snap_realm(struct rb_root *root,
80 struct ceph_snap_realm *new)
81{
82 struct rb_node **p = &root->rb_node;
83 struct rb_node *parent = NULL;
84 struct ceph_snap_realm *r = NULL;
85
86 while (*p) {
87 parent = *p;
88 r = rb_entry(parent, struct ceph_snap_realm, node);
89 if (new->ino < r->ino)
90 p = &(*p)->rb_left;
91 else if (new->ino > r->ino)
92 p = &(*p)->rb_right;
93 else
94 BUG();
95 }
96
97 rb_link_node(&new->node, parent, p);
98 rb_insert_color(&new->node, root);
99}
100
101/*
102 * create and get the realm rooted at @ino and bump its ref count.
103 *
104 * caller must hold snap_rwsem for write.
105 */
106static struct ceph_snap_realm *ceph_create_snap_realm(
107 struct ceph_mds_client *mdsc,
108 u64 ino)
109{
110 struct ceph_snap_realm *realm;
111
112 realm = kzalloc(sizeof(*realm), GFP_NOFS);
113 if (!realm)
114 return ERR_PTR(-ENOMEM);
115
116 atomic_set(&realm->nref, 0); /* tree does not take a ref */
117 realm->ino = ino;
118 INIT_LIST_HEAD(&realm->children);
119 INIT_LIST_HEAD(&realm->child_item);
120 INIT_LIST_HEAD(&realm->empty_item);
121 INIT_LIST_HEAD(&realm->inodes_with_caps);
122 spin_lock_init(&realm->inodes_with_caps_lock);
123 __insert_snap_realm(&mdsc->snap_realms, realm);
124 dout("create_snap_realm %llx %p\n", realm->ino, realm);
125 return realm;
126}
127
128/*
129 * lookup the realm rooted at @ino.
130 *
131 * caller must hold snap_rwsem for write.
132 */
133struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
134 u64 ino)
135{
136 struct rb_node *n = mdsc->snap_realms.rb_node;
137 struct ceph_snap_realm *r;
138
139 while (n) {
140 r = rb_entry(n, struct ceph_snap_realm, node);
141 if (ino < r->ino)
142 n = n->rb_left;
143 else if (ino > r->ino)
144 n = n->rb_right;
145 else {
146 dout("lookup_snap_realm %llx %p\n", r->ino, r);
147 return r;
148 }
149 }
150 return NULL;
151}
152
153static void __put_snap_realm(struct ceph_mds_client *mdsc,
154 struct ceph_snap_realm *realm);
155
156/*
157 * called with snap_rwsem (write)
158 */
159static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
160 struct ceph_snap_realm *realm)
161{
162 dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
163
164 rb_erase(&realm->node, &mdsc->snap_realms);
165
166 if (realm->parent) {
167 list_del_init(&realm->child_item);
168 __put_snap_realm(mdsc, realm->parent);
169 }
170
171 kfree(realm->prior_parent_snaps);
172 kfree(realm->snaps);
173 ceph_put_snap_context(realm->cached_context);
174 kfree(realm);
175}
176
177/*
178 * caller holds snap_rwsem (write)
179 */
180static void __put_snap_realm(struct ceph_mds_client *mdsc,
181 struct ceph_snap_realm *realm)
182{
183 dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
184 atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
185 if (atomic_dec_and_test(&realm->nref))
186 __destroy_snap_realm(mdsc, realm);
187}
188
189/*
190 * caller needn't hold any locks
191 */
192void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
193 struct ceph_snap_realm *realm)
194{
195 dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
196 atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
197 if (!atomic_dec_and_test(&realm->nref))
198 return;
199
200 if (down_write_trylock(&mdsc->snap_rwsem)) {
201 __destroy_snap_realm(mdsc, realm);
202 up_write(&mdsc->snap_rwsem);
203 } else {
204 spin_lock(&mdsc->snap_empty_lock);
205 list_add(&mdsc->snap_empty, &realm->empty_item);
206 spin_unlock(&mdsc->snap_empty_lock);
207 }
208}
209
210/*
211 * that this does not include realms that were created but not yet
212 * that this does not include realms who were created but not yet
213 * used.
214 *
215 * Called under snap_rwsem (write)
216 */
217static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
218{
219 struct ceph_snap_realm *realm;
220
221 spin_lock(&mdsc->snap_empty_lock);
222 while (!list_empty(&mdsc->snap_empty)) {
223 realm = list_first_entry(&mdsc->snap_empty,
224 struct ceph_snap_realm, empty_item);
225 list_del(&realm->empty_item);
226 spin_unlock(&mdsc->snap_empty_lock);
227 __destroy_snap_realm(mdsc, realm);
228 spin_lock(&mdsc->snap_empty_lock);
229 }
230 spin_unlock(&mdsc->snap_empty_lock);
231}
232
233void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
234{
235 down_write(&mdsc->snap_rwsem);
236 __cleanup_empty_realms(mdsc);
237 up_write(&mdsc->snap_rwsem);
238}
239
240/*
241 * adjust the parent realm of a given @realm. adjust child list, and parent
242 * pointers, and ref counts appropriately.
243 *
244 * return 1 if the parent was changed, 0 if unchanged, or <0 on error.
245 *
246 * caller must hold snap_rwsem for write.
247 */
248static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
249 struct ceph_snap_realm *realm,
250 u64 parentino)
251{
252 struct ceph_snap_realm *parent;
253
254 if (realm->parent_ino == parentino)
255 return 0;
256
257 parent = ceph_lookup_snap_realm(mdsc, parentino);
258 if (!parent) {
259 parent = ceph_create_snap_realm(mdsc, parentino);
260 if (IS_ERR(parent))
261 return PTR_ERR(parent);
262 }
263 dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
264 realm->ino, realm, realm->parent_ino, realm->parent,
265 parentino, parent);
266 if (realm->parent) {
267 list_del_init(&realm->child_item);
268 ceph_put_snap_realm(mdsc, realm->parent);
269 }
270 realm->parent_ino = parentino;
271 realm->parent = parent;
272 ceph_get_snap_realm(mdsc, parent);
273 list_add(&realm->child_item, &parent->children);
274 return 1;
275}
276
277
278static int cmpu64_rev(const void *a, const void *b)
279{
280 if (*(u64 *)a < *(u64 *)b)
281 return 1;
282 if (*(u64 *)a > *(u64 *)b)
283 return -1;
284 return 0;
285}
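/* e.g. sort({3, 7, 5}, ...) with cmpu64_rev yields {7, 5, 3}: snap
 * vectors are kept in descending (newest snapid first) order. */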
286
287/*
288 * build the snap context for a given realm.
289 */
290static int build_snap_context(struct ceph_snap_realm *realm)
291{
292 struct ceph_snap_realm *parent = realm->parent;
293 struct ceph_snap_context *snapc;
294 int err = 0;
295 int i;
296 int num = realm->num_prior_parent_snaps + realm->num_snaps;
297
298 /*
299 * build parent context, if it hasn't been built.
300 * conservatively estimate that all parent snaps might be
301 * included by us.
302 */
303 if (parent) {
304 if (!parent->cached_context) {
305 err = build_snap_context(parent);
306 if (err)
307 goto fail;
308 }
309 num += parent->cached_context->num_snaps;
310 }
311
312 /* do i actually need to update? not if my context seq
313 matches realm seq, and my parents' does too. (this works
314 because rebuild_snap_realms() works _downward_ in the
315 hierarchy after each update.) */
316 if (realm->cached_context &&
317 realm->cached_context->seq <= realm->seq &&
318 (!parent ||
319 realm->cached_context->seq <= parent->cached_context->seq)) {
320 dout("build_snap_context %llx %p: %p seq %lld (%d snaps)"
321 " (unchanged)\n",
322 realm->ino, realm, realm->cached_context,
323 realm->cached_context->seq,
324 realm->cached_context->num_snaps);
325 return 0;
326 }
327
328 /* alloc new snap context */
329 err = -ENOMEM;
330 if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc))
331 goto fail;
332 snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);
333 if (!snapc)
334 goto fail;
335 atomic_set(&snapc->nref, 1);
336
337 /* build (reverse sorted) snap vector */
338 num = 0;
339 snapc->seq = realm->seq;
340 if (parent) {
341 /* include any of parent's snaps occurring _after_ my
342 parent became my parent */
343 for (i = 0; i < parent->cached_context->num_snaps; i++)
344 if (parent->cached_context->snaps[i] >=
345 realm->parent_since)
346 snapc->snaps[num++] =
347 parent->cached_context->snaps[i];
348 if (parent->cached_context->seq > snapc->seq)
349 snapc->seq = parent->cached_context->seq;
350 }
351 memcpy(snapc->snaps + num, realm->snaps,
352 sizeof(u64)*realm->num_snaps);
353 num += realm->num_snaps;
354 memcpy(snapc->snaps + num, realm->prior_parent_snaps,
355 sizeof(u64)*realm->num_prior_parent_snaps);
356 num += realm->num_prior_parent_snaps;
357
358 sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
359 snapc->num_snaps = num;
360 dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n",
361 realm->ino, realm, snapc, snapc->seq, snapc->num_snaps);
362
363 if (realm->cached_context)
364 ceph_put_snap_context(realm->cached_context);
365 realm->cached_context = snapc;
366 return 0;
367
368fail:
369 /*
370 * if we fail, clear old (incorrect) cached_context... hopefully
371 * we'll have better luck building it later
372 */
373 if (realm->cached_context) {
374 ceph_put_snap_context(realm->cached_context);
375 realm->cached_context = NULL;
376 }
377 pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
378 realm, err);
379 return err;
380}
381
382/*
383 * rebuild snap context for the given realm and all of its children.
384 */
385static void rebuild_snap_realms(struct ceph_snap_realm *realm)
386{
387 struct ceph_snap_realm *child;
388
389 dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
390 build_snap_context(realm);
391
392 list_for_each_entry(child, &realm->children, child_item)
393 rebuild_snap_realms(child);
394}
395
396
397/*
398 * helper to allocate and decode an array of snapids. free prior
399 * instance, if any.
400 */
401static int dup_array(u64 **dst, __le64 *src, int num)
402{
403 int i;
404
405 kfree(*dst);
406 if (num) {
407 *dst = kcalloc(num, sizeof(u64), GFP_NOFS);
408 if (!*dst)
409 return -ENOMEM;
410 for (i = 0; i < num; i++)
411 (*dst)[i] = get_unaligned_le64(src + i);
412 } else {
413 *dst = NULL;
414 }
415 return 0;
416}
417
418
419/*
420 * When a snapshot is applied, the size/mtime inode metadata is queued
421 * in a ceph_cap_snap (one for each snapshot) until writeback
422 * completes and the metadata can be flushed back to the MDS.
423 *
424 * However, if a (sync) write is currently in-progress when we apply
425 * the snapshot, we have to wait until the write succeeds or fails
426 * (and a final size/mtime is known). In this case the
427 * cap_snap->writing = 1, and is said to be "pending." When the write
428 * finishes, we __ceph_finish_cap_snap().
429 *
430 * Caller must hold snap_rwsem for read (i.e., the realm topology won't
431 * change).
432 */
433void ceph_queue_cap_snap(struct ceph_inode_info *ci,
434 struct ceph_snap_context *snapc)
435{
436 struct inode *inode = &ci->vfs_inode;
437 struct ceph_cap_snap *capsnap;
438 int used;
439
440 capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
441 if (!capsnap) {
442 pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
443 return;
444 }
445
446 spin_lock(&inode->i_lock);
447 used = __ceph_caps_used(ci);
448 if (__ceph_have_pending_cap_snap(ci)) {
449 /* there is no point in queuing multiple "pending" cap_snaps,
450 as no new writes are allowed to start when pending, so any
451 writes in progress now were started before the previous
452 cap_snap. lucky us. */
453 dout("queue_cap_snap %p snapc %p seq %llu used %d"
454 " already pending\n", inode, snapc, snapc->seq, used);
455 kfree(capsnap);
456 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
457 igrab(inode);
458
459 atomic_set(&capsnap->nref, 1);
460 capsnap->ci = ci;
461 INIT_LIST_HEAD(&capsnap->ci_item);
462 INIT_LIST_HEAD(&capsnap->flushing_item);
463
464 capsnap->follows = snapc->seq - 1;
465 capsnap->context = ceph_get_snap_context(snapc);
466 capsnap->issued = __ceph_caps_issued(ci, NULL);
467 capsnap->dirty = __ceph_caps_dirty(ci);
468
469 capsnap->mode = inode->i_mode;
470 capsnap->uid = inode->i_uid;
471 capsnap->gid = inode->i_gid;
472
473 /* fixme? */
474 capsnap->xattr_blob = NULL;
475 capsnap->xattr_len = 0;
476
477 /* dirty page count moved from _head to this cap_snap;
478 all subsequent page dirtying occurs _after_ this
479 snapshot. */
480 capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
481 ci->i_wrbuffer_ref_head = 0;
482 ceph_put_snap_context(ci->i_head_snapc);
483 ci->i_head_snapc = NULL;
484 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
485
486 if (used & CEPH_CAP_FILE_WR) {
487 dout("queue_cap_snap %p cap_snap %p snapc %p"
488 " seq %llu used WR, now pending\n", inode,
489 capsnap, snapc, snapc->seq);
490 capsnap->writing = 1;
491 } else {
492 /* note mtime, size NOW. */
493 __ceph_finish_cap_snap(ci, capsnap);
494 }
495 } else {
496 dout("queue_cap_snap %p nothing dirty|writing\n", inode);
497 kfree(capsnap);
498 }
499
500 spin_unlock(&inode->i_lock);
501}
502
503/*
504 * Finalize the size, mtime for a cap_snap... that is, settle on final values
505 * to be used for the snapshot, to be flushed back to the mds.
506 *
507 * If capsnap can now be flushed, add to snap_flush list, and return 1.
508 *
509 * Caller must hold i_lock.
510 */
511int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
512 struct ceph_cap_snap *capsnap)
513{
514 struct inode *inode = &ci->vfs_inode;
515 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
516
517 BUG_ON(capsnap->writing);
518 capsnap->size = inode->i_size;
519 capsnap->mtime = inode->i_mtime;
520 capsnap->atime = inode->i_atime;
521 capsnap->ctime = inode->i_ctime;
522 capsnap->time_warp_seq = ci->i_time_warp_seq;
523 if (capsnap->dirty_pages) {
524 dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu "
525 "still has %d dirty pages\n", inode, capsnap,
526 capsnap->context, capsnap->context->seq,
527 capsnap->size, capsnap->dirty_pages);
528 return 0;
529 }
530 dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu clean\n",
531 inode, capsnap, capsnap->context,
532 capsnap->context->seq, capsnap->size);
533
534 spin_lock(&mdsc->snap_flush_lock);
535 list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
536 spin_unlock(&mdsc->snap_flush_lock);
537 return 1; /* caller may want to ceph_flush_snaps */
538}
539
540
541/*
542 * Parse and apply a snapblob "snap trace" from the MDS. This specifies
543 * the snap realm parameters from a given realm and all of its ancestors,
544 * up to the root.
545 *
546 * Caller must hold snap_rwsem for write.
547 */
548int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
549 void *p, void *e, bool deletion)
550{
551 struct ceph_mds_snap_realm *ri; /* encoded */
552 __le64 *snaps; /* encoded */
553 __le64 *prior_parent_snaps; /* encoded */
554 struct ceph_snap_realm *realm;
555 int invalidate = 0;
556 int err = -ENOMEM;
557
558 dout("update_snap_trace deletion=%d\n", deletion);
559more:
560 ceph_decode_need(&p, e, sizeof(*ri), bad);
561 ri = p;
562 p += sizeof(*ri);
563 ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) +
564 le32_to_cpu(ri->num_prior_parent_snaps)), bad);
565 snaps = p;
566 p += sizeof(u64) * le32_to_cpu(ri->num_snaps);
567 prior_parent_snaps = p;
568 p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps);
569
570 realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino));
571 if (!realm) {
572 realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino));
573 if (IS_ERR(realm)) {
574 err = PTR_ERR(realm);
575 goto fail;
576 }
577 }
578
579 if (le64_to_cpu(ri->seq) > realm->seq) {
580 dout("update_snap_trace updating %llx %p %lld -> %lld\n",
581 realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
582 /*
583 * if the realm seq has changed, queue a cap_snap for every
584 * inode with open caps. we do this _before_ we update
585 * the realm info so that we prepare for writeback under the
586 * _previous_ snap context.
587 *
588 * ...unless it's a snap deletion!
589 */
590 if (!deletion) {
591 struct ceph_inode_info *ci;
592 struct inode *lastinode = NULL;
593
594 spin_lock(&realm->inodes_with_caps_lock);
595 list_for_each_entry(ci, &realm->inodes_with_caps,
596 i_snap_realm_item) {
597 struct inode *inode = igrab(&ci->vfs_inode);
598 if (!inode)
599 continue;
600 spin_unlock(&realm->inodes_with_caps_lock);
601 if (lastinode)
602 iput(lastinode);
603 lastinode = inode;
604 ceph_queue_cap_snap(ci, realm->cached_context);
605 spin_lock(&realm->inodes_with_caps_lock);
606 }
607 spin_unlock(&realm->inodes_with_caps_lock);
608 if (lastinode)
609 iput(lastinode);
610 dout("update_snap_trace cap_snaps queued\n");
611 }
612
613 } else {
614 dout("update_snap_trace %llx %p seq %lld unchanged\n",
615 realm->ino, realm, realm->seq);
616 }
617
618 /* ensure the parent is correct */
619 err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
620 if (err < 0)
621 goto fail;
622 invalidate += err;
623
624 if (le64_to_cpu(ri->seq) > realm->seq) {
625 /* update realm parameters, snap lists */
626 realm->seq = le64_to_cpu(ri->seq);
627 realm->created = le64_to_cpu(ri->created);
628 realm->parent_since = le64_to_cpu(ri->parent_since);
629
630 realm->num_snaps = le32_to_cpu(ri->num_snaps);
631 err = dup_array(&realm->snaps, snaps, realm->num_snaps);
632 if (err < 0)
633 goto fail;
634
635 realm->num_prior_parent_snaps =
636 le32_to_cpu(ri->num_prior_parent_snaps);
637 err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps,
638 realm->num_prior_parent_snaps);
639 if (err < 0)
640 goto fail;
641
642 invalidate = 1;
643 } else if (!realm->cached_context) {
644 invalidate = 1;
645 }
646
647 dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
648 realm, invalidate, p, e);
649
650 if (p < e)
651 goto more;
652
653 /* invalidate when we reach the _end_ (root) of the trace */
654 if (invalidate)
655 rebuild_snap_realms(realm);
656
657 __cleanup_empty_realms(mdsc);
658 return 0;
659
660bad:
661 err = -EINVAL;
662fail:
663 pr_err("update_snap_trace error %d\n", err);
664 return err;
665}
666
667
668/*
669 * Send any cap_snaps that are queued for flush. Try to carry
670 * s_mutex across multiple snap flushes to avoid locking overhead.
671 *
672 * Caller holds no locks.
673 */
674static void flush_snaps(struct ceph_mds_client *mdsc)
675{
676 struct ceph_inode_info *ci;
677 struct inode *inode;
678 struct ceph_mds_session *session = NULL;
679
680 dout("flush_snaps\n");
681 spin_lock(&mdsc->snap_flush_lock);
682 while (!list_empty(&mdsc->snap_flush_list)) {
683 ci = list_first_entry(&mdsc->snap_flush_list,
684 struct ceph_inode_info, i_snap_flush_item);
685 inode = &ci->vfs_inode;
686 igrab(inode);
687 spin_unlock(&mdsc->snap_flush_lock);
688 spin_lock(&inode->i_lock);
689 __ceph_flush_snaps(ci, &session);
690 spin_unlock(&inode->i_lock);
691 iput(inode);
692 spin_lock(&mdsc->snap_flush_lock);
693 }
694 spin_unlock(&mdsc->snap_flush_lock);
695
696 if (session) {
697 mutex_unlock(&session->s_mutex);
698 ceph_put_mds_session(session);
699 }
700 dout("flush_snaps done\n");
701}
702
703
704/*
705 * Handle a snap notification from the MDS.
706 *
707 * This can take two basic forms: the simplest is just a snap creation
708 * or deletion notification on an existing realm. This should update the
709 * realm and its children.
710 *
711 * The more difficult case is realm creation, due to snap creation at a
712 * new point in the file hierarchy, or due to a rename that moves a file or
713 * directory into another realm.
714 */
715void ceph_handle_snap(struct ceph_mds_client *mdsc,
716 struct ceph_mds_session *session,
717 struct ceph_msg *msg)
718{
719 struct super_block *sb = mdsc->client->sb;
720 int mds = session->s_mds;
721 u64 split;
722 int op;
723 int trace_len;
724 struct ceph_snap_realm *realm = NULL;
725 void *p = msg->front.iov_base;
726 void *e = p + msg->front.iov_len;
727 struct ceph_mds_snap_head *h;
728 int num_split_inos, num_split_realms;
729 __le64 *split_inos = NULL, *split_realms = NULL;
730 int i;
731 int locked_rwsem = 0;
732
733 /* decode */
734 if (msg->front.iov_len < sizeof(*h))
735 goto bad;
736 h = p;
737 op = le32_to_cpu(h->op);
738 split = le64_to_cpu(h->split); /* non-zero if we are splitting an
739 * existing realm */
740 num_split_inos = le32_to_cpu(h->num_split_inos);
741 num_split_realms = le32_to_cpu(h->num_split_realms);
742 trace_len = le32_to_cpu(h->trace_len);
743 p += sizeof(*h);
744
745 dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
746 ceph_snap_op_name(op), split, trace_len);
747
748 mutex_lock(&session->s_mutex);
749 session->s_seq++;
750 mutex_unlock(&session->s_mutex);
751
752 down_write(&mdsc->snap_rwsem);
753 locked_rwsem = 1;
754
755 if (op == CEPH_SNAP_OP_SPLIT) {
756 struct ceph_mds_snap_realm *ri;
757
758 /*
759 * A "split" breaks part of an existing realm off into
760 * a new realm. The MDS provides a list of inodes
761 * (with caps) and child realms that belong to the new
762 * child.
763 */
764 split_inos = p;
765 p += sizeof(u64) * num_split_inos;
766 split_realms = p;
767 p += sizeof(u64) * num_split_realms;
768 ceph_decode_need(&p, e, sizeof(*ri), bad);
769 /* we will peek at realm info here, but will _not_
770 * advance p, as the realm update will occur below in
771 * ceph_update_snap_trace. */
772 ri = p;
773
774 realm = ceph_lookup_snap_realm(mdsc, split);
775 if (!realm) {
776 realm = ceph_create_snap_realm(mdsc, split);
777 if (IS_ERR(realm))
778 goto out;
779 }
780 ceph_get_snap_realm(mdsc, realm);
781
782 dout("splitting snap_realm %llx %p\n", realm->ino, realm);
783 for (i = 0; i < num_split_inos; i++) {
784 struct ceph_vino vino = {
785 .ino = le64_to_cpu(split_inos[i]),
786 .snap = CEPH_NOSNAP,
787 };
788 struct inode *inode = ceph_find_inode(sb, vino);
789 struct ceph_inode_info *ci;
790
791 if (!inode)
792 continue;
793 ci = ceph_inode(inode);
794
795 spin_lock(&inode->i_lock);
796 if (!ci->i_snap_realm)
797 goto skip_inode;
798 /*
799 * If this inode belongs to a realm that was
800 * created after our new realm, we experienced
801 * a race (due to another split notification
802 * arriving from a different MDS). So skip
803 * this inode.
804 */
805 if (ci->i_snap_realm->created >
806 le64_to_cpu(ri->created)) {
807 dout(" leaving %p in newer realm %llx %p\n",
808 inode, ci->i_snap_realm->ino,
809 ci->i_snap_realm);
810 goto skip_inode;
811 }
812 dout(" will move %p to split realm %llx %p\n",
813 inode, realm->ino, realm);
814 /*
815 * Remove the inode from the realm's inode
816 * list, but don't add it to the new realm
817 * yet. We don't want the cap_snap to be
818 * queued (again) by ceph_update_snap_trace()
819 * below. Queue it _now_, under the old context.
820 */
821 list_del_init(&ci->i_snap_realm_item);
822 spin_unlock(&inode->i_lock);
823
824 ceph_queue_cap_snap(ci,
825 ci->i_snap_realm->cached_context);
826
827 iput(inode);
828 continue;
829
830skip_inode:
831 spin_unlock(&inode->i_lock);
832 iput(inode);
833 }
834
835 /* we may have taken some of the old realm's children. */
836 for (i = 0; i < num_split_realms; i++) {
837 struct ceph_snap_realm *child =
838 ceph_lookup_snap_realm(mdsc,
839 le64_to_cpu(split_realms[i]));
840 if (!child)
841 continue;
842 adjust_snap_realm_parent(mdsc, child, realm->ino);
843 }
844 }
845
846 /*
847 * update using the provided snap trace. if we are deleting a
848 * snap, we can avoid queueing cap_snaps.
849 */
850 ceph_update_snap_trace(mdsc, p, e,
851 op == CEPH_SNAP_OP_DESTROY);
852
853 if (op == CEPH_SNAP_OP_SPLIT) {
854 /*
855 * ok, _now_ add the inodes into the new realm.
856 */
857 for (i = 0; i < num_split_inos; i++) {
858 struct ceph_vino vino = {
859 .ino = le64_to_cpu(split_inos[i]),
860 .snap = CEPH_NOSNAP,
861 };
862 struct inode *inode = ceph_find_inode(sb, vino);
863 struct ceph_inode_info *ci;
864
865 if (!inode)
866 continue;
867 ci = ceph_inode(inode);
868 spin_lock(&inode->i_lock);
869 if (!ci->i_snap_realm)
870 goto split_skip_inode;
871 ceph_put_snap_realm(mdsc, ci->i_snap_realm);
872 spin_lock(&realm->inodes_with_caps_lock);
873 list_add(&ci->i_snap_realm_item,
874 &realm->inodes_with_caps);
875 ci->i_snap_realm = realm;
876 spin_unlock(&realm->inodes_with_caps_lock);
877 ceph_get_snap_realm(mdsc, realm);
878split_skip_inode:
879 spin_unlock(&inode->i_lock);
880 iput(inode);
881 }
882
883 /* we took a reference when we created the realm, above */
884 ceph_put_snap_realm(mdsc, realm);
885 }
886
887 __cleanup_empty_realms(mdsc);
888
889 up_write(&mdsc->snap_rwsem);
890
891 flush_snaps(mdsc);
892 return;
893
894bad:
895 pr_err("corrupt snap message from mds%d\n", mds);
896 ceph_msg_dump(msg);
897out:
898 if (locked_rwsem)
899 up_write(&mdsc->snap_rwsem);
900 return;
901}
902
903
904
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
new file mode 100644
index 000000000000..4290a6e860b0
--- /dev/null
+++ b/fs/ceph/super.c
@@ -0,0 +1,1030 @@
1
2#include "ceph_debug.h"
3
4#include <linux/backing-dev.h>
5#include <linux/fs.h>
6#include <linux/inet.h>
7#include <linux/in6.h>
8#include <linux/module.h>
9#include <linux/mount.h>
10#include <linux/parser.h>
11#include <linux/rwsem.h>
12#include <linux/sched.h>
13#include <linux/seq_file.h>
14#include <linux/statfs.h>
15#include <linux/string.h>
16#include <linux/version.h>
17#include <linux/vmalloc.h>
18
19#include "decode.h"
20#include "super.h"
21#include "mon_client.h"
22#include "auth.h"
23
24/*
25 * Ceph superblock operations
26 *
27 * Handle the basics of mounting, unmounting.
28 */
29
30
31/*
32 * find filename portion of a path (/foo/bar/baz -> baz)
33 */
34const char *ceph_file_part(const char *s, int len)
35{
36 const char *e = s + len;
37
38 while (e != s && *(e-1) != '/')
39 e--;
40 return e;
41}
42
43
44/*
45 * super ops
46 */
47static void ceph_put_super(struct super_block *s)
48{
49 struct ceph_client *cl = ceph_client(s);
50
51 dout("put_super\n");
52 ceph_mdsc_close_sessions(&cl->mdsc);
53 return;
54}
55
56static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
57{
58 struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
59 struct ceph_monmap *monmap = client->monc.monmap;
60 struct ceph_statfs st;
61 u64 fsid;
62 int err;
63
64 dout("statfs\n");
65 err = ceph_monc_do_statfs(&client->monc, &st);
66 if (err < 0)
67 return err;
68
69 /* fill in kstatfs */
70 buf->f_type = CEPH_SUPER_MAGIC; /* ?? */
71
72 /*
73 * express utilization in terms of large blocks to avoid
74 * overflow on 32-bit machines.
75 */
76 buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
77 buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
78 buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >>
79 (CEPH_BLOCK_SHIFT-10);
80 buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
81
82 buf->f_files = le64_to_cpu(st.num_objects);
83 buf->f_ffree = -1;
84 buf->f_namelen = PATH_MAX;
85 buf->f_frsize = PAGE_CACHE_SIZE;
86
87 /* leave fsid little-endian, regardless of host endianness */
88 fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
89 buf->f_fsid.val[0] = fsid & 0xffffffff;
90 buf->f_fsid.val[1] = fsid >> 32;
91
92 return 0;
93}
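
To make the unit conversion above concrete, here is a minimal userspace sketch of the same arithmetic (illustrative only; it assumes CEPH_BLOCK_SHIFT is 20, i.e. 1 MB blocks, as defined in super.h below). The monitor reports sizes in 1 KB units, so shifting right by CEPH_BLOCK_SHIFT-10 re-expresses them as 1 MB blocks and keeps the counters within 32 bits:

#include <stdio.h>
#include <stdint.h>

#define CEPH_BLOCK_SHIFT 20	/* 1 MB blocks, matching super.h */

int main(void)
{
	uint64_t kb = 1ULL << 32;	/* 4 TB, reported in 1 KB units */

	/* kb counts 2^10-byte units; shift by (20 - 10) for 2^20-byte units */
	uint64_t blocks = kb >> (CEPH_BLOCK_SHIFT - 10);

	/* 4294967296 KB becomes 4194304 blocks, which fits in 32 bits */
	printf("%llu KB = %llu 1MB blocks\n",
	       (unsigned long long)kb, (unsigned long long)blocks);
	return 0;
}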
94
95
96static int ceph_syncfs(struct super_block *sb, int wait)
97{
98 dout("sync_fs %d\n", wait);
99 ceph_osdc_sync(&ceph_client(sb)->osdc);
100 ceph_mdsc_sync(&ceph_client(sb)->mdsc);
101 dout("sync_fs %d done\n", wait);
102 return 0;
103}
104
105
106/**
107 * ceph_show_options - Show mount options in /proc/mounts
108 * @m: seq_file to write to
109 * @mnt: mount descriptor
110 */
111static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
112{
113 struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
114 struct ceph_mount_args *args = client->mount_args;
115
116 if (args->flags & CEPH_OPT_FSID)
117 seq_printf(m, ",fsidmajor=%llu,fsidminor=%llu",
118 le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]),
119 le64_to_cpu(*(__le64 *)&args->fsid.fsid[8]));
120 if (args->flags & CEPH_OPT_NOSHARE)
121 seq_puts(m, ",noshare");
122 if (args->flags & CEPH_OPT_DIRSTAT)
123 seq_puts(m, ",dirstat");
124 if ((args->flags & CEPH_OPT_RBYTES) == 0)
125 seq_puts(m, ",norbytes");
126 if (args->flags & CEPH_OPT_NOCRC)
127 seq_puts(m, ",nocrc");
128 if (args->flags & CEPH_OPT_NOASYNCREADDIR)
129 seq_puts(m, ",noasyncreaddir");
130 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
131 seq_printf(m, ",snapdirname=%s", args->snapdir_name);
132 if (args->name)
133 seq_printf(m, ",name=%s", args->name);
134 if (args->secret)
135 seq_puts(m, ",secret=<hidden>");
136 return 0;
137}
138
139/*
140 * caches
141 */
142struct kmem_cache *ceph_inode_cachep;
143struct kmem_cache *ceph_cap_cachep;
144struct kmem_cache *ceph_dentry_cachep;
145struct kmem_cache *ceph_file_cachep;
146
147static void ceph_inode_init_once(void *foo)
148{
149 struct ceph_inode_info *ci = foo;
150 inode_init_once(&ci->vfs_inode);
151}
152
153static int default_congestion_kb(void)
154{
155 int congestion_kb;
156
157 /*
158 * Copied from NFS
159 *
160 * congestion size, scale with available memory.
161 *
162 * 64MB: 8192k
163 * 128MB: 11585k
164 * 256MB: 16384k
165 * 512MB: 23170k
166 * 1GB: 32768k
167 * 2GB: 46340k
168 * 4GB: 65536k
169 * 8GB: 92681k
170 * 16GB: 131072k
171 *
172 * This allows larger machines to have larger/more transfers.
173 * Limit the default to 256M
174 */
175 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
176 if (congestion_kb > 256*1024)
177 congestion_kb = 256*1024;
178
179 return congestion_kb;
180}
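
As a sanity check against the table above, this userspace sketch reproduces the formula for a machine with 1 GB of RAM and 4 KB pages (both assumed); int_sqrt() here is a naive stand-in for the kernel helper:

#include <stdio.h>

/* crude integer square root standing in for the kernel's int_sqrt() */
static unsigned long int_sqrt(unsigned long x)
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	unsigned long page_shift = 12;		/* assume 4 KB pages */
	unsigned long totalram_pages = 262144;	/* assume 1 GB of RAM */
	unsigned long kb;

	kb = (16 * int_sqrt(totalram_pages)) << (page_shift - 10);
	if (kb > 256 * 1024)
		kb = 256 * 1024;
	printf("default congestion_kb = %luk\n", kb);	/* 32768k, per the table */
	return 0;
}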
181
182static int __init init_caches(void)
183{
184 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
185 sizeof(struct ceph_inode_info),
186 __alignof__(struct ceph_inode_info),
187 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
188 ceph_inode_init_once);
189 if (ceph_inode_cachep == NULL)
190 return -ENOMEM;
191
192 ceph_cap_cachep = KMEM_CACHE(ceph_cap,
193 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
194 if (ceph_cap_cachep == NULL)
195 goto bad_cap;
196
197 ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
198 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
199 if (ceph_dentry_cachep == NULL)
200 goto bad_dentry;
201
202 ceph_file_cachep = KMEM_CACHE(ceph_file_info,
203 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
204 if (ceph_file_cachep == NULL)
205 goto bad_file;
206
207 return 0;
208
209bad_file:
210 kmem_cache_destroy(ceph_dentry_cachep);
211bad_dentry:
212 kmem_cache_destroy(ceph_cap_cachep);
213bad_cap:
214 kmem_cache_destroy(ceph_inode_cachep);
215 return -ENOMEM;
216}
217
218static void destroy_caches(void)
219{
220 kmem_cache_destroy(ceph_inode_cachep);
221 kmem_cache_destroy(ceph_cap_cachep);
222 kmem_cache_destroy(ceph_dentry_cachep);
223 kmem_cache_destroy(ceph_file_cachep);
224}
225
226
227/*
228 * ceph_umount_begin - initiate forced umount. Tear down the
229 * mount, skipping steps that may hang while waiting for server(s).
230 */
231static void ceph_umount_begin(struct super_block *sb)
232{
233 struct ceph_client *client = ceph_sb_to_client(sb);
234
235 dout("ceph_umount_begin - starting forced umount\n");
236 if (!client)
237 return;
238 client->mount_state = CEPH_MOUNT_SHUTDOWN;
239 return;
240}
241
242static const struct super_operations ceph_super_ops = {
243 .alloc_inode = ceph_alloc_inode,
244 .destroy_inode = ceph_destroy_inode,
245 .write_inode = ceph_write_inode,
246 .sync_fs = ceph_syncfs,
247 .put_super = ceph_put_super,
248 .show_options = ceph_show_options,
249 .statfs = ceph_statfs,
250 .umount_begin = ceph_umount_begin,
251};
252
253
254const char *ceph_msg_type_name(int type)
255{
256 switch (type) {
257 case CEPH_MSG_SHUTDOWN: return "shutdown";
258 case CEPH_MSG_PING: return "ping";
259 case CEPH_MSG_AUTH: return "auth";
260 case CEPH_MSG_AUTH_REPLY: return "auth_reply";
261 case CEPH_MSG_MON_MAP: return "mon_map";
262 case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
263 case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
264 case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
265 case CEPH_MSG_STATFS: return "statfs";
266 case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
267 case CEPH_MSG_MDS_MAP: return "mds_map";
268 case CEPH_MSG_CLIENT_SESSION: return "client_session";
269 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
270 case CEPH_MSG_CLIENT_REQUEST: return "client_request";
271 case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
272 case CEPH_MSG_CLIENT_REPLY: return "client_reply";
273 case CEPH_MSG_CLIENT_CAPS: return "client_caps";
274 case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
275 case CEPH_MSG_CLIENT_SNAP: return "client_snap";
276 case CEPH_MSG_CLIENT_LEASE: return "client_lease";
277 case CEPH_MSG_OSD_MAP: return "osd_map";
278 case CEPH_MSG_OSD_OP: return "osd_op";
279 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
280 default: return "unknown";
281 }
282}
283
284
285/*
286 * mount options
287 */
288enum {
289 Opt_fsidmajor,
290 Opt_fsidminor,
291 Opt_monport,
292 Opt_wsize,
293 Opt_rsize,
294 Opt_osdtimeout,
295 Opt_osdkeepalivetimeout,
296 Opt_mount_timeout,
297 Opt_osd_idle_ttl,
298 Opt_caps_wanted_delay_min,
299 Opt_caps_wanted_delay_max,
300 Opt_readdir_max_entries,
301 Opt_congestion_kb,
302 Opt_last_int,
303 /* int args above */
304 Opt_snapdirname,
305 Opt_name,
306 Opt_secret,
307 Opt_last_string,
308 /* string args above */
309 Opt_ip,
310 Opt_noshare,
311 Opt_dirstat,
312 Opt_nodirstat,
313 Opt_rbytes,
314 Opt_norbytes,
315 Opt_nocrc,
316 Opt_noasyncreaddir,
317};
318
319static match_table_t arg_tokens = {
320 {Opt_fsidmajor, "fsidmajor=%ld"},
321 {Opt_fsidminor, "fsidminor=%ld"},
322 {Opt_monport, "monport=%d"},
323 {Opt_wsize, "wsize=%d"},
324 {Opt_rsize, "rsize=%d"},
325 {Opt_osdtimeout, "osdtimeout=%d"},
326 {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
327 {Opt_mount_timeout, "mount_timeout=%d"},
328 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
329 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
330 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
331 {Opt_readdir_max_entries, "readdir_max_entries=%d"},
332 {Opt_congestion_kb, "write_congestion_kb=%d"},
333 /* int args above */
334 {Opt_snapdirname, "snapdirname=%s"},
335 {Opt_name, "name=%s"},
336 {Opt_secret, "secret=%s"},
337 /* string args above */
338 {Opt_ip, "ip=%s"},
339 {Opt_noshare, "noshare"},
340 {Opt_dirstat, "dirstat"},
341 {Opt_nodirstat, "nodirstat"},
342 {Opt_rbytes, "rbytes"},
343 {Opt_norbytes, "norbytes"},
344 {Opt_nocrc, "nocrc"},
345 {Opt_noasyncreaddir, "noasyncreaddir"},
346 {-1, NULL}
347};
348
349
350static struct ceph_mount_args *parse_mount_args(int flags, char *options,
351 const char *dev_name,
352 const char **path)
353{
354 struct ceph_mount_args *args;
355 const char *c;
356 int err = -ENOMEM;
357 substring_t argstr[MAX_OPT_ARGS];
358
359 args = kzalloc(sizeof(*args), GFP_KERNEL);
360 if (!args)
361 return ERR_PTR(-ENOMEM);
362 args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr),
363 GFP_KERNEL);
364 if (!args->mon_addr)
365 goto out;
366
367 dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name);
368
369 /* start with defaults */
370 args->sb_flags = flags;
371 args->flags = CEPH_OPT_DEFAULT;
372 args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
373 args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
374 args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
375 args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
376 args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
377 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
378 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
379 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
380 args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4;
381 args->max_readdir = 1024;
382 args->congestion_kb = default_congestion_kb();
383
384 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
385 err = -EINVAL;
386 if (!dev_name)
387 goto out;
388 *path = strstr(dev_name, ":/");
389 if (*path == NULL) {
390 pr_err("device name is missing path (no :/ in %s)\n",
391 dev_name);
392 goto out;
393 }
394
395 /* get mon ip(s) */
396 err = ceph_parse_ips(dev_name, *path, args->mon_addr,
397 CEPH_MAX_MON, &args->num_mon);
398 if (err < 0)
399 goto out;
400
401 /* path on server */
402 *path += 2;
403 dout("server path '%s'\n", *path);
404
405 /* parse mount options */
406 while ((c = strsep(&options, ",")) != NULL) {
407 int token, intval, ret;
408 if (!*c)
409 continue;
410 err = -EINVAL;
411 token = match_token((char *)c, arg_tokens, argstr);
412 if (token < 0) {
413 pr_err("bad mount option at '%s'\n", c);
414 goto out;
415 }
416 if (token < Opt_last_int) {
417 ret = match_int(&argstr[0], &intval);
418 if (ret < 0) {
419 pr_err("bad mount option arg (not int) "
420 "at '%s'\n", c);
421 continue;
422 }
423 dout("got int token %d val %d\n", token, intval);
424 } else if (token > Opt_last_int && token < Opt_last_string) {
425 dout("got string token %d val %s\n", token,
426 argstr[0].from);
427 } else {
428 dout("got token %d\n", token);
429 }
430 switch (token) {
431 case Opt_fsidmajor:
432 *(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval);
433 break;
434 case Opt_fsidminor:
435 *(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval);
436 break;
case Opt_monport:
/* monitor ports are taken from the device string; ignore */
break;
437 case Opt_ip:
438 err = ceph_parse_ips(argstr[0].from,
439 argstr[0].to,
440 &args->my_addr,
441 1, NULL);
442 if (err < 0)
443 goto out;
444 args->flags |= CEPH_OPT_MYIP;
445 break;
446
447 case Opt_snapdirname:
448 kfree(args->snapdir_name);
449 args->snapdir_name = kstrndup(argstr[0].from,
450 argstr[0].to-argstr[0].from,
451 GFP_KERNEL);
452 break;
453 case Opt_name:
454 args->name = kstrndup(argstr[0].from,
455 argstr[0].to-argstr[0].from,
456 GFP_KERNEL);
457 break;
458 case Opt_secret:
459 args->secret = kstrndup(argstr[0].from,
460 argstr[0].to-argstr[0].from,
461 GFP_KERNEL);
462 break;
463
464 /* misc */
465 case Opt_wsize:
466 args->wsize = intval;
467 break;
468 case Opt_rsize:
469 args->rsize = intval;
470 break;
471 case Opt_osdtimeout:
472 args->osd_timeout = intval;
473 break;
474 case Opt_osdkeepalivetimeout:
475 args->osd_keepalive_timeout = intval;
476 break;
477 case Opt_mount_timeout:
478 args->mount_timeout = intval;
479 break;
case Opt_osd_idle_ttl:
args->osd_idle_ttl = intval;
break;
480 case Opt_caps_wanted_delay_min:
481 args->caps_wanted_delay_min = intval;
482 break;
483 case Opt_caps_wanted_delay_max:
484 args->caps_wanted_delay_max = intval;
485 break;
486 case Opt_readdir_max_entries:
487 args->max_readdir = intval;
488 break;
489 case Opt_congestion_kb:
490 args->congestion_kb = intval;
491 break;
492
493 case Opt_noshare:
494 args->flags |= CEPH_OPT_NOSHARE;
495 break;
496
497 case Opt_dirstat:
498 args->flags |= CEPH_OPT_DIRSTAT;
499 break;
500 case Opt_nodirstat:
501 args->flags &= ~CEPH_OPT_DIRSTAT;
502 break;
503 case Opt_rbytes:
504 args->flags |= CEPH_OPT_RBYTES;
505 break;
506 case Opt_norbytes:
507 args->flags &= ~CEPH_OPT_RBYTES;
508 break;
509 case Opt_nocrc:
510 args->flags |= CEPH_OPT_NOCRC;
511 break;
512 case Opt_noasyncreaddir:
513 args->flags |= CEPH_OPT_NOASYNCREADDIR;
514 break;
515
516 default:
517 BUG_ON(token);
518 }
519 }
520 return args;
521
522out:
523 kfree(args->mon_addr);
524 kfree(args);
525 return ERR_PTR(err);
526}
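
The device-name handling above splits the string at the first ":/": everything before it is the comma-separated monitor list handed to ceph_parse_ips(), and everything after the separator is the path on the server. A small userspace sketch of that split (the addresses below are purely illustrative):

#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *dev_name = "192.168.0.1:6789,192.168.0.2:6789:/backups";
	const char *path = strstr(dev_name, ":/");

	if (!path) {
		fprintf(stderr, "device name is missing path (no :/)\n");
		return 1;
	}
	/* everything before ":/" names the monitors; skip the separator */
	printf("monitors: %.*s\n", (int)(path - dev_name), dev_name);
	printf("server path: %s\n", path + 2);
	return 0;
}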
527
528static void destroy_mount_args(struct ceph_mount_args *args)
529{
530 dout("destroy_mount_args %p\n", args);
531 kfree(args->snapdir_name);
532 args->snapdir_name = NULL;
533 kfree(args->name);
534 args->name = NULL;
535 kfree(args->secret);
536 args->secret = NULL;
537 kfree(args);
538}
539
540/*
541 * create a fresh client instance
542 */
543static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
544{
545 struct ceph_client *client;
546 int err = -ENOMEM;
547
548 client = kzalloc(sizeof(*client), GFP_KERNEL);
549 if (client == NULL)
550 return ERR_PTR(-ENOMEM);
551
552 mutex_init(&client->mount_mutex);
553
554 init_waitqueue_head(&client->auth_wq);
555
556 client->sb = NULL;
557 client->mount_state = CEPH_MOUNT_MOUNTING;
558 client->mount_args = args;
559
560 client->msgr = NULL;
561
562 client->auth_err = 0;
563 atomic_long_set(&client->writeback_count, 0);
564
565 err = bdi_init(&client->backing_dev_info);
566 if (err < 0)
567 goto fail;
568
569 err = -ENOMEM;
570 client->wb_wq = create_workqueue("ceph-writeback");
571 if (client->wb_wq == NULL)
572 goto fail_bdi;
573 client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
574 if (client->pg_inv_wq == NULL)
575 goto fail_wb_wq;
576 client->trunc_wq = create_singlethread_workqueue("ceph-trunc");
577 if (client->trunc_wq == NULL)
578 goto fail_pg_inv_wq;
579
580 /* set up mempools */
581 err = -ENOMEM;
582 client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
583 client->mount_args->wsize >> PAGE_CACHE_SHIFT);
584 if (!client->wb_pagevec_pool)
585 goto fail_trunc_wq;
586
587 /* caps */
588 client->min_caps = args->max_readdir;
589 ceph_adjust_min_caps(client->min_caps);
590
591 /* subsystems */
592 err = ceph_monc_init(&client->monc, client);
593 if (err < 0)
594 goto fail_mempool;
595 err = ceph_osdc_init(&client->osdc, client);
596 if (err < 0)
597 goto fail_monc;
598 err = ceph_mdsc_init(&client->mdsc, client);
599 if (err < 0)
600 goto fail_osdc;
601 return client;
602
603fail_osdc:
604 ceph_osdc_stop(&client->osdc);
605fail_monc:
606 ceph_monc_stop(&client->monc);
607fail_mempool:
608 mempool_destroy(client->wb_pagevec_pool);
609fail_trunc_wq:
610 destroy_workqueue(client->trunc_wq);
611fail_pg_inv_wq:
612 destroy_workqueue(client->pg_inv_wq);
613fail_wb_wq:
614 destroy_workqueue(client->wb_wq);
615fail_bdi:
616 bdi_destroy(&client->backing_dev_info);
617fail:
618 kfree(client);
619 return ERR_PTR(err);
620}
621
622static void ceph_destroy_client(struct ceph_client *client)
623{
624 dout("destroy_client %p\n", client);
625
626 /* unmount */
627 ceph_mdsc_stop(&client->mdsc);
628 ceph_monc_stop(&client->monc);
629 ceph_osdc_stop(&client->osdc);
630
631 ceph_adjust_min_caps(-client->min_caps);
632
633 ceph_debugfs_client_cleanup(client);
634 destroy_workqueue(client->wb_wq);
635 destroy_workqueue(client->pg_inv_wq);
636 destroy_workqueue(client->trunc_wq);
637
638 if (client->msgr)
639 ceph_messenger_destroy(client->msgr);
640 mempool_destroy(client->wb_pagevec_pool);
641
642 destroy_mount_args(client->mount_args);
643
644 kfree(client);
645 dout("destroy_client %p done\n", client);
646}
647
648/*
649 * Initially learn our fsid, or verify an fsid matches.
650 */
651int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
652{
653 if (client->have_fsid) {
654 if (ceph_fsid_compare(&client->fsid, fsid)) {
655 pr_err("bad fsid, had " FSID_FORMAT " got " FSID_FORMAT,
656 PR_FSID(&client->fsid), PR_FSID(fsid));
657 return -1;
658 }
659 } else {
660 pr_info("client%lld fsid " FSID_FORMAT "\n",
661 client->monc.auth->global_id, PR_FSID(fsid));
662 memcpy(&client->fsid, fsid, sizeof(*fsid));
663 ceph_debugfs_client_init(client);
664 client->have_fsid = true;
665 }
666 return 0;
667}
668
669/*
670 * true if we have the mon map (and have thus joined the cluster)
671 */
672static int have_mon_map(struct ceph_client *client)
673{
674 return client->monc.monmap && client->monc.monmap->epoch;
675}
676
677/*
678 * Bootstrap mount by opening the root directory. Note the mount
679 * @started time from caller, and time out if this takes too long.
680 */
681static struct dentry *open_root_dentry(struct ceph_client *client,
682 const char *path,
683 unsigned long started)
684{
685 struct ceph_mds_client *mdsc = &client->mdsc;
686 struct ceph_mds_request *req = NULL;
687 int err;
688 struct dentry *root;
689
690 /* open dir */
691 dout("open_root_inode opening '%s'\n", path);
692 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
693 if (IS_ERR(req))
694 return ERR_PTR(PTR_ERR(req));
695 req->r_path1 = kstrdup(path, GFP_NOFS);
696 req->r_ino1.ino = CEPH_INO_ROOT;
697 req->r_ino1.snap = CEPH_NOSNAP;
698 req->r_started = started;
699 req->r_timeout = client->mount_args->mount_timeout * HZ;
700 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
701 req->r_num_caps = 2;
702 err = ceph_mdsc_do_request(mdsc, NULL, req);
703 if (err == 0) {
704 dout("open_root_inode success\n");
705 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
706 client->sb->s_root == NULL)
707 root = d_alloc_root(req->r_target_inode);
708 else
709 root = d_obtain_alias(req->r_target_inode);
710 req->r_target_inode = NULL;
711 dout("open_root_inode success, root dentry is %p\n", root);
712 } else {
713 root = ERR_PTR(err);
714 }
715 ceph_mdsc_put_request(req);
716 return root;
717}
718
719/*
720 * mount: join the ceph cluster, and open root directory.
721 */
722static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
723 const char *path)
724{
725 struct ceph_entity_addr *myaddr = NULL;
726 int err;
727 unsigned long timeout = client->mount_args->mount_timeout * HZ;
728 unsigned long started = jiffies; /* note the start time */
729 struct dentry *root;
730
731 dout("mount start\n");
732 mutex_lock(&client->mount_mutex);
733
734 /* initialize the messenger */
735 if (client->msgr == NULL) {
736 if (ceph_test_opt(client, MYIP))
737 myaddr = &client->mount_args->my_addr;
738 client->msgr = ceph_messenger_create(myaddr);
739 if (IS_ERR(client->msgr)) {
740 err = PTR_ERR(client->msgr);
741 client->msgr = NULL;
742 goto out;
743 }
744 client->msgr->nocrc = ceph_test_opt(client, NOCRC);
745 }
746
747 /* open session, and wait for mon, mds, and osd maps */
748 err = ceph_monc_open_session(&client->monc);
749 if (err < 0)
750 goto out;
751
752 while (!have_mon_map(client)) {
753 err = -EIO;
754 if (timeout && time_after_eq(jiffies, started + timeout))
755 goto out;
756
757 /* wait */
758 dout("mount waiting for mon_map\n");
759 err = wait_event_interruptible_timeout(client->auth_wq,
760 have_mon_map(client) || (client->auth_err < 0),
761 timeout);
762 if (err == -EINTR || err == -ERESTARTSYS)
763 goto out;
764 if (client->auth_err < 0) {
765 err = client->auth_err;
766 goto out;
767 }
768 }
769
770 dout("mount opening root\n");
771 root = open_root_dentry(client, "", started);
772 if (IS_ERR(root)) {
773 err = PTR_ERR(root);
774 goto out;
775 }
776 if (client->sb->s_root)
777 dput(root);
778 else
779 client->sb->s_root = root;
780
781 if (path[0] == 0) {
782 dget(root);
783 } else {
784 dout("mount opening base mountpoint\n");
785 root = open_root_dentry(client, path, started);
786 if (IS_ERR(root)) {
787 err = PTR_ERR(root);
788 dput(client->sb->s_root);
789 client->sb->s_root = NULL;
790 goto out;
791 }
792 }
793
794 mnt->mnt_root = root;
795 mnt->mnt_sb = client->sb;
796
797 client->mount_state = CEPH_MOUNT_MOUNTED;
798 dout("mount success\n");
799 err = 0;
800
801out:
802 mutex_unlock(&client->mount_mutex);
803 return err;
804}
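
The wait loop above combines two timeout mechanisms: wait_event_interruptible_timeout() bounds each individual sleep, while the explicit time_after_eq() check bounds the total time since @started, so repeated wakeups cannot stretch the mount attempt indefinitely. A rough userspace sketch of that deadline pattern (names are illustrative, and the one-second poll stands in for sleeping on auth_wq):

#include <stdio.h>
#include <time.h>
#include <unistd.h>

static int have_map;	/* stands in for have_mon_map(client) */

static int wait_for_map(unsigned int timeout_secs)
{
	time_t started = time(NULL);

	while (!have_map) {
		/* re-check the overall deadline before waiting again */
		if (timeout_secs &&
		    time(NULL) >= started + (time_t)timeout_secs)
			return -1;	/* the kernel code returns -EIO */
		sleep(1);
	}
	return 0;
}

int main(void)
{
	printf("no map: %d\n", wait_for_map(2));	/* times out, -1 */
	have_map = 1;
	printf("map arrived: %d\n", wait_for_map(2));	/* succeeds, 0 */
	return 0;
}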
805
806static int ceph_set_super(struct super_block *s, void *data)
807{
808 struct ceph_client *client = data;
809 int ret;
810
811 dout("set_super %p data %p\n", s, data);
812
813 s->s_flags = client->mount_args->sb_flags;
814 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
815
816 s->s_fs_info = client;
817 client->sb = s;
818
819 s->s_op = &ceph_super_ops;
820 s->s_export_op = &ceph_export_ops;
821
822 s->s_time_gran = 1000; /* 1000 ns == 1 us */
823
824 ret = set_anon_super(s, NULL); /* what is that second arg for? */
825 if (ret != 0)
826 goto fail;
827
828 return ret;
829
830fail:
831 s->s_fs_info = NULL;
832 client->sb = NULL;
833 return ret;
834}
835
836/*
837 * share superblock if same fs AND options
838 */
839static int ceph_compare_super(struct super_block *sb, void *data)
840{
841 struct ceph_client *new = data;
842 struct ceph_mount_args *args = new->mount_args;
843 struct ceph_client *other = ceph_sb_to_client(sb);
844 int i;
845
846 dout("ceph_compare_super %p\n", sb);
847 if (args->flags & CEPH_OPT_FSID) {
848 if (ceph_fsid_compare(&args->fsid, &other->fsid)) {
849 dout("fsid doesn't match\n");
850 return 0;
851 }
852 } else {
853 /* do we share (a) monitor? */
854 for (i = 0; i < new->monc.monmap->num_mon; i++)
855 if (ceph_monmap_contains(other->monc.monmap,
856 &new->monc.monmap->mon_inst[i].addr))
857 break;
858 if (i == new->monc.monmap->num_mon) {
859 dout("mon ip not part of monmap\n");
860 return 0;
861 }
862 dout("mon ip matches existing sb %p\n", sb);
863 }
864 if (args->sb_flags != other->mount_args->sb_flags) {
865 dout("flags differ\n");
866 return 0;
867 }
868 return 1;
869}
870
871/*
872 * construct our own bdi so we can control readahead, etc.
873 */
874static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
875{
876 int err;
877
878 sb->s_bdi = &client->backing_dev_info;
879
880 /* set ra_pages based on rsize mount option? */
881 if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
882 client->backing_dev_info.ra_pages =
883 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
884 >> PAGE_CACHE_SHIFT;
885 err = bdi_register_dev(&client->backing_dev_info, sb->s_dev);
886 return err;
887}
888
889static int ceph_get_sb(struct file_system_type *fs_type,
890 int flags, const char *dev_name, void *data,
891 struct vfsmount *mnt)
892{
893 struct super_block *sb;
894 struct ceph_client *client;
895 int err;
896 int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
897 const char *path = NULL;
898 struct ceph_mount_args *args;
899
900 dout("ceph_get_sb\n");
901 args = parse_mount_args(flags, data, dev_name, &path);
902 if (IS_ERR(args)) {
903 err = PTR_ERR(args);
904 goto out_final;
905 }
906
907 /* create client (which we may/may not use) */
908 client = ceph_create_client(args);
909 if (IS_ERR(client)) {
910 err = PTR_ERR(client);
911 goto out_final;
912 }
913
914 if (client->mount_args->flags & CEPH_OPT_NOSHARE)
915 compare_super = NULL;
916 sb = sget(fs_type, compare_super, ceph_set_super, client);
917 if (IS_ERR(sb)) {
918 err = PTR_ERR(sb);
919 goto out;
920 }
921
922 if (ceph_client(sb) != client) {
923 ceph_destroy_client(client);
924 client = ceph_client(sb);
925 dout("get_sb got existing client %p\n", client);
926 } else {
927 dout("get_sb using new client %p\n", client);
928 err = ceph_register_bdi(sb, client);
929 if (err < 0)
930 goto out_splat;
931 }
932
933 err = ceph_mount(client, mnt, path);
934 if (err < 0)
935 goto out_splat;
936 dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
937 mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode));
938 return 0;
939
940out_splat:
941 ceph_mdsc_close_sessions(&client->mdsc);
942 up_write(&sb->s_umount);
943 deactivate_super(sb);
944 goto out_final;
945
946out:
947 ceph_destroy_client(client);
948out_final:
949 dout("ceph_get_sb fail %d\n", err);
950 return err;
951}
952
953static void ceph_kill_sb(struct super_block *s)
954{
955 struct ceph_client *client = ceph_sb_to_client(s);
956 dout("kill_sb %p\n", s);
957 ceph_mdsc_pre_umount(&client->mdsc);
958 kill_anon_super(s); /* will call put_super after sb is r/o */
959 if (s->s_bdi == &client->backing_dev_info)
960 bdi_unregister(&client->backing_dev_info);
961 bdi_destroy(&client->backing_dev_info);
962 ceph_destroy_client(client);
963}
964
965static struct file_system_type ceph_fs_type = {
966 .owner = THIS_MODULE,
967 .name = "ceph",
968 .get_sb = ceph_get_sb,
969 .kill_sb = ceph_kill_sb,
970 .fs_flags = FS_RENAME_DOES_D_MOVE,
971};
972
973#define _STRINGIFY(x) #x
974#define STRINGIFY(x) _STRINGIFY(x)
975
976static int __init init_ceph(void)
977{
978 int ret = 0;
979
980 ret = ceph_debugfs_init();
981 if (ret < 0)
982 goto out;
983
984 ret = ceph_msgr_init();
985 if (ret < 0)
986 goto out_debugfs;
987
988 ret = init_caches();
989 if (ret)
990 goto out_msgr;
991
992 ceph_caps_init();
993
994 ret = register_filesystem(&ceph_fs_type);
995 if (ret)
996 goto out_icache;
997
998 pr_info("loaded %d.%d.%d (mon/mds/osd proto %d/%d/%d)\n",
999 CEPH_VERSION_MAJOR, CEPH_VERSION_MINOR, CEPH_VERSION_PATCH,
1000 CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL);
1001 return 0;
1002
1003out_icache:
1004 destroy_caches();
1005out_msgr:
1006 ceph_msgr_exit();
1007out_debugfs:
1008 ceph_debugfs_cleanup();
1009out:
1010 return ret;
1011}
1012
1013static void __exit exit_ceph(void)
1014{
1015 dout("exit_ceph\n");
1016 unregister_filesystem(&ceph_fs_type);
1017 ceph_caps_finalize();
1018 destroy_caches();
1019 ceph_msgr_exit();
1020 ceph_debugfs_cleanup();
1021}
1022
1023module_init(init_ceph);
1024module_exit(exit_ceph);
1025
1026MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
1027MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
1028MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
1029MODULE_DESCRIPTION("Ceph filesystem for Linux");
1030MODULE_LICENSE("GPL");
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
new file mode 100644
index 000000000000..65d12036b670
--- /dev/null
+++ b/fs/ceph/super.h
@@ -0,0 +1,901 @@
1#ifndef _FS_CEPH_SUPER_H
2#define _FS_CEPH_SUPER_H
3
4#include "ceph_debug.h"
5
6#include <asm/unaligned.h>
7#include <linux/backing-dev.h>
8#include <linux/completion.h>
9#include <linux/exportfs.h>
10#include <linux/fs.h>
11#include <linux/mempool.h>
12#include <linux/pagemap.h>
13#include <linux/wait.h>
14#include <linux/writeback.h>
15
16#include "types.h"
17#include "messenger.h"
18#include "msgpool.h"
19#include "mon_client.h"
20#include "mds_client.h"
21#include "osd_client.h"
22#include "ceph_fs.h"
23
24/* f_type in struct statfs */
25#define CEPH_SUPER_MAGIC 0x00c36400
26
27/* large granularity for statfs utilization stats to facilitate
28 * large volume sizes on 32-bit machines. */
29#define CEPH_BLOCK_SHIFT 20 /* 1 MB */
30#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
31
32/*
33 * mount options
34 */
35#define CEPH_OPT_FSID (1<<0)
36#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
37#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
38#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */
39#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
40#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */
41#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
42
43#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES)
44
45#define ceph_set_opt(client, opt) \
46 ((client)->mount_args->flags |= CEPH_OPT_##opt)
47#define ceph_test_opt(client, opt) \
48 (!!((client)->mount_args->flags & CEPH_OPT_##opt))
49
50
51struct ceph_mount_args {
52 int sb_flags;
53 int num_mon;
54 struct ceph_entity_addr *mon_addr;
55 int flags;
56 int mount_timeout;
57 int osd_idle_ttl;
58 int caps_wanted_delay_min, caps_wanted_delay_max;
59 struct ceph_fsid fsid;
60 struct ceph_entity_addr my_addr;
61 int wsize;
62 int rsize; /* max readahead */
63 int max_readdir; /* max readdir size */
64 int congestion_kb; /* max writeback congestion, in KB */
65 int osd_timeout;
66 int osd_keepalive_timeout;
67 char *snapdir_name; /* default ".snap" */
68 char *name;
69 char *secret;
70 int cap_release_safety;
71};
72
73/*
74 * defaults
75 */
76#define CEPH_MOUNT_TIMEOUT_DEFAULT 60
77#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */
78#define CEPH_OSD_KEEPALIVE_DEFAULT 5
79#define CEPH_OSD_IDLE_TTL_DEFAULT 60
80#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
81
82#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
83#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
84
85#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
86#define CEPH_AUTH_NAME_DEFAULT "guest"
87
88/*
89 * Delay telling the MDS we no longer want caps, in case we reopen
90 * the file. Delay a minimum amount of time, even if we send a cap
91 * message for some other reason. Otherwise, take the opportunity to
92 * update the mds to avoid sending another message later.
93 */
94#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
95#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
96
97
98/* mount state */
99enum {
100 CEPH_MOUNT_MOUNTING,
101 CEPH_MOUNT_MOUNTED,
102 CEPH_MOUNT_UNMOUNTING,
103 CEPH_MOUNT_UNMOUNTED,
104 CEPH_MOUNT_SHUTDOWN,
105};
106
107/*
108 * subtract jiffies
109 */
110static inline unsigned long time_sub(unsigned long a, unsigned long b)
111{
112 BUG_ON(time_after(b, a));
113 return (long)a - (long)b;
114}
115
116/*
117 * per-filesystem client state
118 *
119 * possibly shared by multiple mount points, if they are
120 * mounting the same ceph filesystem/cluster.
121 */
122struct ceph_client {
123 struct ceph_fsid fsid;
124 bool have_fsid;
125
126 struct mutex mount_mutex; /* serialize mount attempts */
127 struct ceph_mount_args *mount_args;
128
129 struct super_block *sb;
130
131 unsigned long mount_state;
132 wait_queue_head_t auth_wq;
133
134 int auth_err;
135
136 int min_caps; /* min caps i added */
137
138 struct ceph_messenger *msgr; /* messenger instance */
139 struct ceph_mon_client monc;
140 struct ceph_mds_client mdsc;
141 struct ceph_osd_client osdc;
142
143 /* writeback */
144 mempool_t *wb_pagevec_pool;
145 struct workqueue_struct *wb_wq;
146 struct workqueue_struct *pg_inv_wq;
147 struct workqueue_struct *trunc_wq;
148 atomic_long_t writeback_count;
149
150 struct backing_dev_info backing_dev_info;
151
152#ifdef CONFIG_DEBUG_FS
153 struct dentry *debugfs_monmap;
154 struct dentry *debugfs_mdsmap, *debugfs_osdmap;
155 struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
156 struct dentry *debugfs_congestion_kb;
157 struct dentry *debugfs_bdi;
158#endif
159};
160
161static inline struct ceph_client *ceph_client(struct super_block *sb)
162{
163 return sb->s_fs_info;
164}
165
166
167/*
168 * File i/o capability. This tracks shared state with the metadata
169 * server that allows us to cache or writeback attributes or to read
170 * and write data. For any given inode, we should have one or more
171 * capabilities, one issued by each metadata server, and our
172 * cumulative access is the OR of all issued capabilities.
173 *
174 * Each cap is referenced by the inode's i_caps rbtree and by per-mds
175 * session capability lists.
176 */
177struct ceph_cap {
178 struct ceph_inode_info *ci;
179 struct rb_node ci_node; /* per-ci cap tree */
180 struct ceph_mds_session *session;
181 struct list_head session_caps; /* per-session caplist */
182 int mds;
183 u64 cap_id; /* unique cap id (mds provided) */
184 int issued; /* latest, from the mds */
185 int implemented; /* implemented superset of issued (for revocation) */
186 int mds_wanted;
187 u32 seq, issue_seq, mseq;
188 u32 cap_gen; /* active/stale cycle */
189 unsigned long last_used;
190 struct list_head caps_item;
191};
192
193#define CHECK_CAPS_NODELAY 1 /* do not delay any further */
194#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
195#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
196
197/*
198 * Snapped cap state that is pending flush to mds. When a snapshot occurs,
199 * we first complete any in-process sync writes and writeback any dirty
200 * data before flushing the snapped state (tracked here) back to the MDS.
201 */
202struct ceph_cap_snap {
203 atomic_t nref;
204 struct ceph_inode_info *ci;
205 struct list_head ci_item, flushing_item;
206
207 u64 follows, flush_tid;
208 int issued, dirty;
209 struct ceph_snap_context *context;
210
211 mode_t mode;
212 uid_t uid;
213 gid_t gid;
214
215 void *xattr_blob;
216 int xattr_len;
217 u64 xattr_version;
218
219 u64 size;
220 struct timespec mtime, atime, ctime;
221 u64 time_warp_seq;
222 int writing; /* a sync write is still in progress */
223 int dirty_pages; /* dirty pages awaiting writeback */
224};
225
226static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
227{
228 if (atomic_dec_and_test(&capsnap->nref))
229 kfree(capsnap);
230}
231
232/*
233 * The frag tree describes how a directory is fragmented, potentially across
234 * multiple metadata servers. It is also used to indicate points where
235 * metadata authority is delegated, and whether/where metadata is replicated.
236 *
237 * A _leaf_ frag will be present in the i_fragtree IFF there is
238 * delegation info. That is, if mds >= 0 || ndist > 0.
239 */
240#define CEPH_MAX_DIRFRAG_REP 4
241
242struct ceph_inode_frag {
243 struct rb_node node;
244
245 /* fragtree state */
246 u32 frag;
247 int split_by; /* i.e. 2^(split_by) children */
248
249 /* delegation and replication info */
250 int mds; /* -1 if same authority as parent */
251 int ndist; /* >0 if replicated */
252 int dist[CEPH_MAX_DIRFRAG_REP];
253};
254
255/*
256 * We cache inode xattrs as an encoded blob until they are first used,
257 * at which point we parse them into an rbtree.
258 */
259struct ceph_inode_xattr {
260 struct rb_node node;
261
262 const char *name;
263 int name_len;
264 const char *val;
265 int val_len;
266 int dirty;
267
268 int should_free_name;
269 int should_free_val;
270};
271
272struct ceph_inode_xattrs_info {
273 /*
274 * (still encoded) xattr blob. we avoid the overhead of parsing
275 * this until someone actually calls getxattr, etc.
276 *
277 * blob->vec.iov_len == 4 implies there are no xattrs; blob ==
278 * NULL means we don't know.
279 */
280 struct ceph_buffer *blob, *prealloc_blob;
281
282 struct rb_root index;
283 bool dirty;
284 int count;
285 int names_size;
286 int vals_size;
287 u64 version, index_version;
288};
289
290/*
291 * Ceph inode.
292 */
293#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
294#define CEPH_I_NODELAY 4 /* do not delay cap release */
295#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
296#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
297
298struct ceph_inode_info {
299 struct ceph_vino i_vino; /* ceph ino + snap */
300
301 u64 i_version;
302 u32 i_time_warp_seq;
303
304 unsigned i_ceph_flags;
305 unsigned long i_release_count;
306
307 struct ceph_file_layout i_layout;
308 char *i_symlink;
309
310 /* for dirs */
311 struct timespec i_rctime;
312 u64 i_rbytes, i_rfiles, i_rsubdirs;
313 u64 i_files, i_subdirs;
314 u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */
315
316 struct rb_root i_fragtree;
317 struct mutex i_fragtree_mutex;
318
319 struct ceph_inode_xattrs_info i_xattrs;
320
321 /* capabilities. protected _both_ by i_lock and cap->session's
322 * s_mutex. */
323 struct rb_root i_caps; /* cap list */
324 struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
325 unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
326 struct list_head i_dirty_item, i_flushing_item;
327 u64 i_cap_flush_seq;
328 /* we need to track cap writeback on a per-cap-bit basis, to allow
329 * overlapping, pipelined cap flushes to the mds. we can probably
330 * reduce the tid to 8 bits if we're concerned about inode size. */
331 u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
332 wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
333 unsigned long i_hold_caps_min; /* jiffies */
334 unsigned long i_hold_caps_max; /* jiffies */
335 struct list_head i_cap_delay_list; /* for delayed cap release to mds */
336 int i_cap_exporting_mds; /* to handle cap migration between */
337 unsigned i_cap_exporting_mseq; /* mds's. */
338 unsigned i_cap_exporting_issued;
339 struct ceph_cap_reservation i_cap_migration_resv;
340 struct list_head i_cap_snaps; /* snapped state pending flush to mds */
341 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 */
342 unsigned i_snap_caps; /* cap bits for snapped files */
343
344 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
345
346 u32 i_truncate_seq; /* last truncate to smaller size */
347 u64 i_truncate_size; /* and the size we last truncated down to */
348 int i_truncate_pending; /* still need to call vmtruncate */
349
350 u64 i_max_size; /* max file size authorized by mds */
351 u64 i_reported_size; /* (max_)size reported to or requested of mds */
352 u64 i_wanted_max_size; /* offset we'd like to write to */
353 u64 i_requested_max_size; /* max_size we've requested */
354
355 /* held references to caps */
356 int i_pin_ref;
357 int i_rd_ref, i_rdcache_ref, i_wr_ref;
358 int i_wrbuffer_ref, i_wrbuffer_ref_head;
359 u32 i_shared_gen; /* increment each time we get FILE_SHARED */
360 u32 i_rdcache_gen; /* we increment this each time we get
361 FILE_CACHE. If it's non-zero, we
362 _may_ have cached pages. */
363 u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
364
365 struct list_head i_unsafe_writes; /* uncommitted sync writes */
366 struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
367 spinlock_t i_unsafe_lock;
368
369 struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
370 int i_snap_realm_counter; /* snap realm (if caps) */
371 struct list_head i_snap_realm_item;
372 struct list_head i_snap_flush_item;
373
374 struct work_struct i_wb_work; /* writeback work */
375 struct work_struct i_pg_inv_work; /* page invalidation work */
376
377 struct work_struct i_vmtruncate_work;
378
379 struct inode vfs_inode; /* at end */
380};
381
382static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
383{
384 return container_of(inode, struct ceph_inode_info, vfs_inode);
385}
386
387static inline void ceph_i_clear(struct inode *inode, unsigned mask)
388{
389 struct ceph_inode_info *ci = ceph_inode(inode);
390
391 spin_lock(&inode->i_lock);
392 ci->i_ceph_flags &= ~mask;
393 spin_unlock(&inode->i_lock);
394}
395
396static inline void ceph_i_set(struct inode *inode, unsigned mask)
397{
398 struct ceph_inode_info *ci = ceph_inode(inode);
399
400 spin_lock(&inode->i_lock);
401 ci->i_ceph_flags |= mask;
402 spin_unlock(&inode->i_lock);
403}
404
405static inline bool ceph_i_test(struct inode *inode, unsigned mask)
406{
407 struct ceph_inode_info *ci = ceph_inode(inode);
408 bool r;
409
410 smp_mb();
411 r = (ci->i_ceph_flags & mask) == mask;
412 return r;
413}
414
415
416/* find a specific frag @f */
417extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci,
418 u32 f);
419
420/*
421 * choose fragment for value @v. copy frag content to pfrag, if leaf
422 * exists
423 */
424extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
425 struct ceph_inode_frag *pfrag,
426 int *found);
427
428/*
429 * Ceph dentry state
430 */
431struct ceph_dentry_info {
432 struct ceph_mds_session *lease_session;
433 u32 lease_gen, lease_shared_gen;
434 u32 lease_seq;
435 unsigned long lease_renew_after, lease_renew_from;
436 struct list_head lru;
437 struct dentry *dentry;
438 u64 time;
439 u64 offset;
440};
441
442static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
443{
444 return (struct ceph_dentry_info *)dentry->d_fsdata;
445}
446
447static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
448{
449 return ((loff_t)frag << 32) | (loff_t)off;
450}
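
For example, frag 0x2 with offset 0x10 packs to f_pos 0x200000010, and the halves can be recovered with a shift and a mask. A userspace sketch, assuming 32-bit frag and offset values as above:

#include <stdio.h>
#include <stdint.h>

/* pack a directory fragment and an intra-fragment offset into one
 * 64-bit file position, as ceph_make_fpos() does above */
static int64_t make_fpos(uint32_t frag, uint32_t off)
{
	return ((int64_t)frag << 32) | (int64_t)off;
}

int main(void)
{
	int64_t pos = make_fpos(0x2, 0x10);

	printf("fpos = %#llx\n", (unsigned long long)pos);	/* 0x200000010 */
	printf("frag = %#llx, off = %#llx\n",
	       (unsigned long long)(pos >> 32),
	       (unsigned long long)(pos & 0xffffffff));
	return 0;
}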
451
452/*
453 * ino_t is <64 bits on many architectures, blech.
454 *
455 * don't include snap in ino hash, at least for now.
456 */
457static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
458{
459 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
460#if BITS_PER_LONG == 32
461 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
462 if (!ino)
463 ino = 1;
464#endif
465 return ino;
466}
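
On 32-bit hosts the helper above folds the high half of the 64-bit ceph ino into the low half rather than silently truncating, and reserves ino 0. A userspace sketch with an illustrative ino value:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t vino_ino = 0x100000002ULL;	/* illustrative 64-bit ceph ino */
	uint32_t ino = (uint32_t)vino_ino;	/* truncation to a 32-bit ino_t */

	/* fold the high 32 bits back in, as the BITS_PER_LONG == 32 branch does */
	ino ^= vino_ino >> 32;
	if (!ino)
		ino = 1;	/* 0 is reserved */
	printf("64-bit ino %#llx -> 32-bit ino %#x\n",
	       (unsigned long long)vino_ino, ino);
	return 0;
}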
467
468static inline int ceph_set_ino_cb(struct inode *inode, void *data)
469{
470 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
471 inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
472 return 0;
473}
474
475static inline struct ceph_vino ceph_vino(struct inode *inode)
476{
477 return ceph_inode(inode)->i_vino;
478}
479
480/* for printf-style formatting */
481#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
482
483static inline u64 ceph_ino(struct inode *inode)
484{
485 return ceph_inode(inode)->i_vino.ino;
486}
487static inline u64 ceph_snap(struct inode *inode)
488{
489 return ceph_inode(inode)->i_vino.snap;
490}
491
492static inline int ceph_ino_compare(struct inode *inode, void *data)
493{
494 struct ceph_vino *pvino = (struct ceph_vino *)data;
495 struct ceph_inode_info *ci = ceph_inode(inode);
496 return ci->i_vino.ino == pvino->ino &&
497 ci->i_vino.snap == pvino->snap;
498}
499
500static inline struct inode *ceph_find_inode(struct super_block *sb,
501 struct ceph_vino vino)
502{
503 ino_t t = ceph_vino_to_ino(vino);
504 return ilookup5(sb, t, ceph_ino_compare, &vino);
505}
506
507
508/*
509 * caps helpers
510 */
511static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
512{
513 return !RB_EMPTY_ROOT(&ci->i_caps);
514}
515
516extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented);
517extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t);
518extern int __ceph_caps_issued_other(struct ceph_inode_info *ci,
519 struct ceph_cap *cap);
520
521static inline int ceph_caps_issued(struct ceph_inode_info *ci)
522{
523 int issued;
524 spin_lock(&ci->vfs_inode.i_lock);
525 issued = __ceph_caps_issued(ci, NULL);
526 spin_unlock(&ci->vfs_inode.i_lock);
527 return issued;
528}
529
530static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,
531 int touch)
532{
533 int r;
534 spin_lock(&ci->vfs_inode.i_lock);
535 r = __ceph_caps_issued_mask(ci, mask, touch);
536 spin_unlock(&ci->vfs_inode.i_lock);
537 return r;
538}
539
540static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
541{
542 return ci->i_dirty_caps | ci->i_flushing_caps;
543}
544extern void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
545
546extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
547extern int __ceph_caps_used(struct ceph_inode_info *ci);
548
549extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
550
551/*
552 * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
553 */
554static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
555{
556 int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
557 if (w & CEPH_CAP_FILE_BUFFER)
558 w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */
559 return w;
560}
561
562/* what the mds thinks we want */
563extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
564
565extern void ceph_caps_init(void);
566extern void ceph_caps_finalize(void);
567extern void ceph_adjust_min_caps(int delta);
568extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need);
569extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx);
570extern void ceph_reservation_status(struct ceph_client *client,
571 int *total, int *avail, int *used,
572 int *reserved, int *min);
573
574static inline struct ceph_client *ceph_inode_to_client(struct inode *inode)
575{
576 return (struct ceph_client *)inode->i_sb->s_fs_info;
577}
578
579static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb)
580{
581 return (struct ceph_client *)sb->s_fs_info;
582}
583
584
585/*
586 * we keep buffered readdir results attached to file->private_data
587 */
588struct ceph_file_info {
589 int fmode; /* initialized on open */
590
591 /* readdir: position within the dir */
592 u32 frag;
593 struct ceph_mds_request *last_readdir;
594 int at_end;
595
596 /* readdir: position within a frag */
597 unsigned offset; /* offset of last chunk, adjusted for . and .. */
598 u64 next_offset; /* offset of next chunk (last_name's + 1) */
599 char *last_name; /* last entry in previous chunk */
600 struct dentry *dentry; /* next dentry (for dcache readdir) */
601 unsigned long dir_release_count;
602
603 /* used for -o dirstat read() on a directory */
604 char *dir_info;
605 int dir_info_len;
606};
607
608
609
610/*
611 * snapshots
612 */
613
614/*
615 * A "snap context" is the set of existing snapshots when we
616 * write data. It is used by the OSD to guide its COW behavior.
617 *
618 * The ceph_snap_context is refcounted, and attached to each dirty
619 * page, indicating which context the dirty data belonged to when it was
620 * dirtied.
621 */
622struct ceph_snap_context {
623 atomic_t nref;
624 u64 seq;
625 int num_snaps;
626 u64 snaps[];
627};
628
629static inline struct ceph_snap_context *
630ceph_get_snap_context(struct ceph_snap_context *sc)
631{
632 /*
633 printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
634 atomic_read(&sc->nref)+1);
635 */
636 if (sc)
637 atomic_inc(&sc->nref);
638 return sc;
639}
640
641static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
642{
643 if (!sc)
644 return;
645 /*
646 printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
647 atomic_read(&sc->nref)-1);
648 */
649 if (atomic_dec_and_test(&sc->nref)) {
650 /*printk(" deleting snap_context %p\n", sc);*/
651 kfree(sc);
652 }
653}
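
Because every dirty page takes a reference on the snap context it was dirtied under, the context stays alive until the last page referencing it has been written back; it is freed only on the final put. A minimal userspace sketch of this nref pattern using C11 atomics (all names illustrative):

#include <stdio.h>
#include <stdlib.h>
#include <stdatomic.h>

struct snap_context {
	atomic_int nref;
	unsigned long long seq;
};

static struct snap_context *get_ctx(struct snap_context *sc)
{
	if (sc)
		atomic_fetch_add(&sc->nref, 1);
	return sc;
}

static void put_ctx(struct snap_context *sc)
{
	if (!sc)
		return;
	/* atomic_fetch_sub returns the old value; 1 means this was the last ref */
	if (atomic_fetch_sub(&sc->nref, 1) == 1) {
		printf("freeing snap context seq %llu\n", sc->seq);
		free(sc);
	}
}

int main(void)
{
	struct snap_context *sc = calloc(1, sizeof(*sc));

	atomic_init(&sc->nref, 1);	/* creator's reference */
	sc->seq = 42;
	get_ctx(sc);	/* e.g. a dirty page takes a reference */
	put_ctx(sc);	/* the page is written back */
	put_ctx(sc);	/* creator drops its reference; context is freed */
	return 0;
}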
654
655/*
656 * A "snap realm" describes a subset of the file hierarchy sharing
657 * the same set of snapshots that apply to it. The realms themselves
658 * are organized into a hierarchy, such that children inherit (some of)
659 * the snapshots of their parents.
660 *
661 * All inodes within the realm that have capabilities are linked into a
662 * per-realm list.
663 */
664struct ceph_snap_realm {
665 u64 ino;
666 atomic_t nref;
667 struct rb_node node;
668
669 u64 created, seq;
670 u64 parent_ino;
671 u64 parent_since; /* snapid when our current parent became so */
672
673 u64 *prior_parent_snaps; /* snaps inherited from any parents we */
674 int num_prior_parent_snaps; /* had prior to parent_since */
675 u64 *snaps; /* snaps specific to this realm */
676 int num_snaps;
677
678 struct ceph_snap_realm *parent;
679 struct list_head children; /* list of child realms */
680 struct list_head child_item;
681
682 struct list_head empty_item; /* if i have ref==0 */
683
684 /* the current set of snaps for this realm */
685 struct ceph_snap_context *cached_context;
686
687 struct list_head inodes_with_caps;
688 spinlock_t inodes_with_caps_lock;
689};
690
691
692
693/*
694 * calculate the number of pages a given length and offset map onto,
695 * if we align the data.
696 */
697static inline int calc_pages_for(u64 off, u64 len)
698{
699 return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
700 (off >> PAGE_CACHE_SHIFT);
701}
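
For instance, with 4 KB pages, 5000 bytes starting at offset 1000 touch two pages, while one page-aligned page of data touches exactly one. A userspace sketch of the same arithmetic (page size assumed):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12	/* assume 4 KB pages */
#define PAGE_SIZE (1UL << PAGE_SHIFT)

static int calc_pages_for(uint64_t off, uint64_t len)
{
	return ((off + len + PAGE_SIZE - 1) >> PAGE_SHIFT) -
	       (off >> PAGE_SHIFT);
}

int main(void)
{
	printf("calc_pages_for(1000, 5000) = %d\n",
	       calc_pages_for(1000, 5000));	/* spans pages 0..1 -> 2 */
	printf("calc_pages_for(4096, 4096) = %d\n",
	       calc_pages_for(4096, 4096));	/* exactly page 1 -> 1 */
	return 0;
}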
702
703
704
705/* snap.c */
706struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
707 u64 ino);
708extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
709 struct ceph_snap_realm *realm);
710extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
711 struct ceph_snap_realm *realm);
712extern int ceph_update_snap_trace(struct ceph_mds_client *m,
713 void *p, void *e, bool deletion);
714extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
715 struct ceph_mds_session *session,
716 struct ceph_msg *msg);
717extern void ceph_queue_cap_snap(struct ceph_inode_info *ci,
718 struct ceph_snap_context *snapc);
719extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
720 struct ceph_cap_snap *capsnap);
721extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
722
723/*
724 * a cap_snap is "pending" if it is still awaiting an in-progress
725 * sync write (that may/may not still update size, mtime, etc.).
726 */
727static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
728{
729 return !list_empty(&ci->i_cap_snaps) &&
730 list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap,
731 ci_item)->writing;
732}
733
734
735/* super.c */
736extern struct kmem_cache *ceph_inode_cachep;
737extern struct kmem_cache *ceph_cap_cachep;
738extern struct kmem_cache *ceph_dentry_cachep;
739extern struct kmem_cache *ceph_file_cachep;
740
741extern const char *ceph_msg_type_name(int type);
742extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
743
744#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \
745 "%02x%02x%02x%02x%02x%02x"
746#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \
747 (f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7], \
748 (f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11], \
749 (f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15]
750
751/* inode.c */
752extern const struct inode_operations ceph_file_iops;
753
754extern struct inode *ceph_alloc_inode(struct super_block *sb);
755extern void ceph_destroy_inode(struct inode *inode);
756
757extern struct inode *ceph_get_inode(struct super_block *sb,
758 struct ceph_vino vino);
759extern struct inode *ceph_get_snapdir(struct inode *parent);
760extern int ceph_fill_file_size(struct inode *inode, int issued,
761 u32 truncate_seq, u64 truncate_size, u64 size);
762extern void ceph_fill_file_time(struct inode *inode, int issued,
763 u64 time_warp_seq, struct timespec *ctime,
764 struct timespec *mtime, struct timespec *atime);
765extern int ceph_fill_trace(struct super_block *sb,
766 struct ceph_mds_request *req,
767 struct ceph_mds_session *session);
768extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
769 struct ceph_mds_session *session);
770
771extern int ceph_inode_holds_cap(struct inode *inode, int mask);
772
773extern int ceph_inode_set_size(struct inode *inode, loff_t size);
774extern void __ceph_do_pending_vmtruncate(struct inode *inode);
775extern void ceph_queue_vmtruncate(struct inode *inode);
776
777extern void ceph_queue_invalidate(struct inode *inode);
778extern void ceph_queue_writeback(struct inode *inode);
779
780extern int ceph_do_getattr(struct inode *inode, int mask);
781extern int ceph_permission(struct inode *inode, int mask);
782extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
783extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
784 struct kstat *stat);
785
786/* xattr.c */
787extern int ceph_setxattr(struct dentry *, const char *, const void *,
788 size_t, int);
789extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
790extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
791extern int ceph_removexattr(struct dentry *, const char *);
792extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
793extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
794
795/* caps.c */
796extern const char *ceph_cap_string(int c);
797extern void ceph_handle_caps(struct ceph_mds_session *session,
798 struct ceph_msg *msg);
799extern int ceph_add_cap(struct inode *inode,
800 struct ceph_mds_session *session, u64 cap_id,
801 int fmode, unsigned issued, unsigned wanted,
802 unsigned cap, unsigned seq, u64 realmino, int flags,
803 struct ceph_cap_reservation *caps_reservation);
804extern void __ceph_remove_cap(struct ceph_cap *cap);
805static inline void ceph_remove_cap(struct ceph_cap *cap)
806{
807 struct inode *inode = &cap->ci->vfs_inode;
808 spin_lock(&inode->i_lock);
809 __ceph_remove_cap(cap);
810 spin_unlock(&inode->i_lock);
811}
812extern void ceph_put_cap(struct ceph_cap *cap);
813
814extern void ceph_queue_caps_release(struct inode *inode);
815extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
816extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync);
817extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
818 struct ceph_mds_session *session);
819extern int ceph_get_cap_mds(struct inode *inode);
820extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
821extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
822extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
823 struct ceph_snap_context *snapc);
824extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
825 struct ceph_mds_session **psession);
826extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
827 struct ceph_mds_session *session);
828extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
829extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
830
831extern int ceph_encode_inode_release(void **p, struct inode *inode,
832 int mds, int drop, int unless, int force);
833extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
834 int mds, int drop, int unless);
835
836extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
837 int *got, loff_t endoff);
838
839/* for counting open files by mode */
840static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
841{
842 ci->i_nr_by_mode[mode]++;
843}
844extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
845
846/* addr.c */
847extern const struct address_space_operations ceph_aops;
848extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
849
850/* file.c */
851extern const struct file_operations ceph_file_fops;
852extern const struct address_space_operations ceph_aops;
853extern int ceph_open(struct inode *inode, struct file *file);
854extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
855 struct nameidata *nd, int mode,
856 int locked_dir);
857extern int ceph_release(struct inode *inode, struct file *filp);
858extern void ceph_release_page_vector(struct page **pages, int num_pages);
859
860/* dir.c */
861extern const struct file_operations ceph_dir_fops;
862extern const struct inode_operations ceph_dir_iops;
863extern struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
864 ceph_snapdir_dentry_ops;
865
866extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
867extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
868 struct dentry *dentry, int err);
869
870extern void ceph_dentry_lru_add(struct dentry *dn);
871extern void ceph_dentry_lru_touch(struct dentry *dn);
872extern void ceph_dentry_lru_del(struct dentry *dn);
873
874/*
875 * our d_ops vary depending on whether the inode is live,
876 * snapshotted (read-only), or a virtual ".snap" directory.
877 */
878int ceph_init_dentry(struct dentry *dentry);
879
880
881/* ioctl.c */
882extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
883
884/* export.c */
885extern const struct export_operations ceph_export_ops;
886
887/* debugfs.c */
888extern int ceph_debugfs_init(void);
889extern void ceph_debugfs_cleanup(void);
890extern int ceph_debugfs_client_init(struct ceph_client *client);
891extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
892
893static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
894{
895 if (dentry && dentry->d_parent)
896 return dentry->d_parent->d_inode;
897
898 return NULL;
899}
900
901#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/types.h b/fs/ceph/types.h
new file mode 100644
index 000000000000..28b35a005ec2
--- /dev/null
+++ b/fs/ceph/types.h
@@ -0,0 +1,29 @@
1#ifndef _FS_CEPH_TYPES_H
2#define _FS_CEPH_TYPES_H
3
4/* needed before including ceph_fs.h */
5#include <linux/in.h>
6#include <linux/types.h>
7#include <linux/fcntl.h>
8#include <linux/string.h>
9
10#include "ceph_fs.h"
11#include "ceph_frag.h"
12#include "ceph_hash.h"
13
14/*
15 * Identify inodes by both their ino AND snapshot id (a u64).
16 */
17struct ceph_vino {
18 u64 ino;
19 u64 snap;
20};
21
22
23/* context for the caps reservation mechanism */
24struct ceph_cap_reservation {
25 int count;
26};
27
28
29#endif
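
The (ino, snap) pair above is what lets the client tell the live copy of an inode apart from a read-only snapshot of the same file: both carry the same ino, but the live one uses the reserved CEPH_NOSNAP snapshot id. A minimal sketch of that identity rule, assuming only the struct above (the ceph_vino_eq helper is illustrative, not part of this patch):

    /* two vinos name the same on-disk inode iff both fields match;
     * same ino with different snap ids means different snapshots */
    static inline bool ceph_vino_eq(struct ceph_vino a, struct ceph_vino b)
    {
            return a.ino == b.ino && a.snap == b.snap;
    }
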
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
new file mode 100644
index 000000000000..37d6ce645691
--- /dev/null
+++ b/fs/ceph/xattr.c
@@ -0,0 +1,844 @@
1#include "ceph_debug.h"
2#include "super.h"
3#include "decode.h"
4
5#include <linux/xattr.h>
6
7static bool ceph_is_valid_xattr(const char *name)
8{
9 return !strncmp(name, XATTR_SECURITY_PREFIX,
10 XATTR_SECURITY_PREFIX_LEN) ||
11 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
12 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
13}
14
15/*
16 * These define virtual xattrs exposing the recursive directory
17 * statistics and layout metadata.
18 */
19struct ceph_vxattr_cb {
20 bool readonly;
21 char *name;
22 size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
23 size_t size);
24};
25
26/* directories */
27
28static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val,
29 size_t size)
30{
31 return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
32}
33
34static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val,
35 size_t size)
36{
37 return snprintf(val, size, "%lld", ci->i_files);
38}
39
40static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val,
41 size_t size)
42{
43 return snprintf(val, size, "%lld", ci->i_subdirs);
44}
45
46static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val,
47 size_t size)
48{
49 return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
50}
51
52static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val,
53 size_t size)
54{
55 return snprintf(val, size, "%lld", ci->i_rfiles);
56}
57
58static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val,
59 size_t size)
60{
61 return snprintf(val, size, "%lld", ci->i_rsubdirs);
62}
63
64static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val,
65 size_t size)
66{
67 return snprintf(val, size, "%lld", ci->i_rbytes);
68}
69
70static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
71 size_t size)
72{
73 return snprintf(val, size, "%ld.%09ld", (long)ci->i_rctime.tv_sec,
74 (long)ci->i_rctime.tv_nsec);
75}
76
77static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
78 { true, "user.ceph.dir.entries", ceph_vxattrcb_entries},
79 { true, "user.ceph.dir.files", ceph_vxattrcb_files},
80 { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs},
81 { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries},
82 { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles},
83 { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
84 { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes},
85 { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime},
86 { true, NULL, NULL }
87};
88
89/* files */
90
91static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
92 size_t size)
93{
94 int ret;
95
96 ret = snprintf(val, size,
97 "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n",
98 (unsigned long long)ceph_file_layout_su(ci->i_layout),
99 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
100 (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
101 if (ceph_file_layout_pg_preferred(ci->i_layout))
102 ret += snprintf(val + ret, size - ret, "preferred_osd=%lld\n",
103 (unsigned long long)ceph_file_layout_pg_preferred(
104 ci->i_layout));
105 return ret;
106}
107
108static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
109 { true, "user.ceph.layout", ceph_vxattrcb_layout},
110 { true, NULL, NULL }
111};
112
113static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
114{
115 if (S_ISDIR(inode->i_mode))
116 return ceph_dir_vxattrs;
117 else if (S_ISREG(inode->i_mode))
118 return ceph_file_vxattrs;
119 return NULL;
120}
121
122static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr,
123 const char *name)
124{
125 do {
126 if (strcmp(vxattr->name, name) == 0)
127 return vxattr;
128 vxattr++;
129 } while (vxattr->name);
130 return NULL;
131}
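
Seen from userspace these virtual xattrs behave like ordinary ones, so the stock getxattr(2) interface is enough to read the recursive statistics. A minimal sketch, assuming a ceph filesystem mounted at the hypothetical path /mnt/ceph:

    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/xattr.h>

    int main(void)
    {
            char buf[64];
            /* served by ceph_vxattrcb_rbytes via the table above */
            ssize_t n = getxattr("/mnt/ceph/somedir", "user.ceph.dir.rbytes",
                                 buf, sizeof(buf) - 1);
            if (n < 0) {
                    perror("getxattr");
                    return 1;
            }
            buf[n] = '\0';
            printf("recursive bytes: %s\n", buf);
            return 0;
    }
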
132
133static int __set_xattr(struct ceph_inode_info *ci,
134 const char *name, int name_len,
135 const char *val, int val_len,
136 int dirty,
137 int should_free_name, int should_free_val,
138 struct ceph_inode_xattr **newxattr)
139{
140 struct rb_node **p;
141 struct rb_node *parent = NULL;
142 struct ceph_inode_xattr *xattr = NULL;
143 int c;
144 int new = 0;
145
146 p = &ci->i_xattrs.index.rb_node;
147 while (*p) {
148 parent = *p;
149 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
150 c = strncmp(name, xattr->name, min(name_len, xattr->name_len));
151 if (c < 0)
152 p = &(*p)->rb_left;
153 else if (c > 0)
154 p = &(*p)->rb_right;
155 else {
156 if (name_len == xattr->name_len)
157 break;
158 else if (name_len < xattr->name_len)
159 p = &(*p)->rb_left;
160 else
161 p = &(*p)->rb_right;
162 }
163 xattr = NULL;
164 }
165
166 if (!xattr) {
167 new = 1;
168 xattr = *newxattr;
169 xattr->name = name;
170 xattr->name_len = name_len;
171 xattr->should_free_name = should_free_name;
172
173 ci->i_xattrs.count++;
174 dout("__set_xattr count=%d\n", ci->i_xattrs.count);
175 } else {
176 kfree(*newxattr);
177 *newxattr = NULL;
178 if (xattr->should_free_val)
179 kfree((void *)xattr->val);
180
181 if (should_free_name) {
182 kfree((void *)name);
183 name = xattr->name;
184 }
185 ci->i_xattrs.names_size -= xattr->name_len;
186 ci->i_xattrs.vals_size -= xattr->val_len;
187 }
188 if (!xattr) {
189 pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %.*s\n",
190 &ci->vfs_inode, ceph_vinop(&ci->vfs_inode),
191 name_len, name);
192 return -ENOMEM;
193 }
194 ci->i_xattrs.names_size += name_len;
195 ci->i_xattrs.vals_size += val_len;
196 if (val)
197 xattr->val = val;
198 else
199 xattr->val = "";
200
201 xattr->val_len = val_len;
202 xattr->dirty = dirty;
203 xattr->should_free_val = (val && should_free_val);
204
205 if (new) {
206 rb_link_node(&xattr->node, parent, p);
207 rb_insert_color(&xattr->node, &ci->i_xattrs.index);
208 dout("__set_xattr_val p=%p\n", p);
209 }
210
211 dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n",
212 ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val);
213
214 return 0;
215}
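
The insertion walk above orders the tree by comparing name bytes over the shorter of the two lengths and breaking ties by length, so e.g. "user.a" sorts before "user.ab". A condensed sketch of that comparison, with an illustrative helper name:

    #include <string.h>

    /* same ordering __set_xattr uses for the xattr rb-tree */
    static int xattr_name_cmp(const char *a, int alen,
                              const char *b, int blen)
    {
            int c = strncmp(a, b, alen < blen ? alen : blen);

            return c ? c : alen - blen;
    }
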
216
217static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
218 const char *name)
219{
220 struct rb_node **p;
221 struct rb_node *parent = NULL;
222 struct ceph_inode_xattr *xattr = NULL;
223 int c;
224
225 p = &ci->i_xattrs.index.rb_node;
226 while (*p) {
227 parent = *p;
228 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
229 c = strncmp(name, xattr->name, xattr->name_len);
230 if (c < 0)
231 p = &(*p)->rb_left;
232 else if (c > 0)
233 p = &(*p)->rb_right;
234 else {
235 dout("__get_xattr %s: found %.*s\n", name,
236 xattr->val_len, xattr->val);
237 return xattr;
238 }
239 }
240
241 dout("__get_xattr %s: not found\n", name);
242
243 return NULL;
244}
245
246static void __free_xattr(struct ceph_inode_xattr *xattr)
247{
248 BUG_ON(!xattr);
249
250 if (xattr->should_free_name)
251 kfree((void *)xattr->name);
252 if (xattr->should_free_val)
253 kfree((void *)xattr->val);
254
255 kfree(xattr);
256}
257
258static int __remove_xattr(struct ceph_inode_info *ci,
259 struct ceph_inode_xattr *xattr)
260{
261 if (!xattr)
262 return -EOPNOTSUPP;
263
264 rb_erase(&xattr->node, &ci->i_xattrs.index);
265
266 if (xattr->should_free_name)
267 kfree((void *)xattr->name);
268 if (xattr->should_free_val)
269 kfree((void *)xattr->val);
270
271 ci->i_xattrs.names_size -= xattr->name_len;
272 ci->i_xattrs.vals_size -= xattr->val_len;
273 ci->i_xattrs.count--;
274 kfree(xattr);
275
276 return 0;
277}
278
279static int __remove_xattr_by_name(struct ceph_inode_info *ci,
280 const char *name)
281{
282 struct rb_node **p;
283 struct ceph_inode_xattr *xattr;
284 int err;
285
286 p = &ci->i_xattrs.index.rb_node;
287 xattr = __get_xattr(ci, name);
288 err = __remove_xattr(ci, xattr);
289 return err;
290}
291
292static char *__copy_xattr_names(struct ceph_inode_info *ci,
293 char *dest)
294{
295 struct rb_node *p;
296 struct ceph_inode_xattr *xattr = NULL;
297
298 p = rb_first(&ci->i_xattrs.index);
299 dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count);
300
301 while (p) {
302 xattr = rb_entry(p, struct ceph_inode_xattr, node);
303 memcpy(dest, xattr->name, xattr->name_len);
304 dest[xattr->name_len] = '\0';
305
306 dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
307 xattr->name_len, ci->i_xattrs.names_size);
308
309 dest += xattr->name_len + 1;
310 p = rb_next(p);
311 }
312
313 return dest;
314}
315
316void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
317{
318 struct rb_node *p, *tmp;
319 struct ceph_inode_xattr *xattr = NULL;
320
321 p = rb_first(&ci->i_xattrs.index);
322
323 dout("__ceph_destroy_xattrs p=%p\n", p);
324
325 while (p) {
326 xattr = rb_entry(p, struct ceph_inode_xattr, node);
327 tmp = p;
328 p = rb_next(tmp);
329 dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p,
330 xattr->name_len, xattr->name);
331 rb_erase(tmp, &ci->i_xattrs.index);
332
333 __free_xattr(xattr);
334 }
335
336 ci->i_xattrs.names_size = 0;
337 ci->i_xattrs.vals_size = 0;
338 ci->i_xattrs.index_version = 0;
339 ci->i_xattrs.count = 0;
340 ci->i_xattrs.index = RB_ROOT;
341}
342
343static int __build_xattrs(struct inode *inode)
344{
345 u32 namelen;
346 u32 numattr = 0;
347 void *p, *end;
348 u32 len;
349 const char *name, *val;
350 struct ceph_inode_info *ci = ceph_inode(inode);
351 u64 xattr_version;
352 struct ceph_inode_xattr **xattrs = NULL;
353 int err = 0;
354 int i;
355
356 dout("__build_xattrs() len=%d\n",
357 ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
358
359 if (ci->i_xattrs.index_version >= ci->i_xattrs.version)
360 return 0; /* already built */
361
362 __ceph_destroy_xattrs(ci);
363
364start:
365 /* update the internal xattr rb tree */
366 if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) {
367 p = ci->i_xattrs.blob->vec.iov_base;
368 end = p + ci->i_xattrs.blob->vec.iov_len;
369 ceph_decode_32_safe(&p, end, numattr, bad);
370 xattr_version = ci->i_xattrs.version;
371 spin_unlock(&inode->i_lock);
372
373 xattrs = kcalloc(numattr, sizeof(struct ceph_inode_xattr *),
374 GFP_NOFS);
375 err = -ENOMEM;
376 if (!xattrs)
377 goto bad_lock;
378 memset(xattrs, 0, numattr*sizeof(struct ceph_inode_xattr *));
379 for (i = 0; i < numattr; i++) {
380 xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
381 GFP_NOFS);
382 if (!xattrs[i])
383 goto bad_lock;
384 }
385
386 spin_lock(&inode->i_lock);
387 if (ci->i_xattrs.version != xattr_version) {
388 /* lost a race, retry */
389 for (i = 0; i < numattr; i++)
390 kfree(xattrs[i]);
391 kfree(xattrs);
392 goto start;
393 }
394 err = -EIO;
395 while (numattr--) {
396 ceph_decode_32_safe(&p, end, len, bad);
397 namelen = len;
398 name = p;
399 p += len;
400 ceph_decode_32_safe(&p, end, len, bad);
401 val = p;
402 p += len;
403
404 err = __set_xattr(ci, name, namelen, val, len,
405 0, 0, 0, &xattrs[numattr]);
406
407 if (err < 0)
408 goto bad;
409 }
410 kfree(xattrs);
411 }
412 ci->i_xattrs.index_version = ci->i_xattrs.version;
413 ci->i_xattrs.dirty = false;
414
415 return err;
416bad_lock:
417 spin_lock(&inode->i_lock);
418bad:
419 if (xattrs) {
420 for (i = 0; i < numattr; i++)
421 kfree(xattrs[i]);
422 kfree(xattrs);
423 }
424 ci->i_xattrs.names_size = 0;
425 return err;
426}
427
428static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
429 int val_size)
430{
431 /*
432 * 4 bytes for the count, plus a 4-byte name length and a 4-byte
433 * value length for each xattr, plus the name and value bytes
434 */
435 int size = 4 + ci->i_xattrs.count*(4 + 4) +
436 ci->i_xattrs.names_size +
437 ci->i_xattrs.vals_size;
438 dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n",
439 ci->i_xattrs.count, ci->i_xattrs.names_size,
440 ci->i_xattrs.vals_size);
441
442 if (name_size)
443 size += 4 + 4 + name_size + val_size;
444
445 return size;
446}
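
To make the formula concrete, suppose an inode carries two xattrs, "user.a" (name 6 bytes) with value "x" (1 byte) and "user.bb" (name 7 bytes) with value "yy" (2 bytes). Then, matching the encoding __ceph_build_xattrs_blob produces below:

    size = 4                /* u32 xattr count */
         + 2 * (4 + 4)      /* u32 name and value lengths, per xattr */
         + (6 + 7)          /* name bytes */
         + (1 + 2)          /* value bytes */
         = 36 bytes
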
447
448/*
449 * If there are dirty xattrs, reencode xattrs into the prealloc_blob
450 * and swap into place.
451 */
452void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
453{
454 struct rb_node *p;
455 struct ceph_inode_xattr *xattr = NULL;
456 void *dest;
457
458 dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
459 if (ci->i_xattrs.dirty) {
460 int need = __get_required_blob_size(ci, 0, 0);
461
462 BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len);
463
464 p = rb_first(&ci->i_xattrs.index);
465 dest = ci->i_xattrs.prealloc_blob->vec.iov_base;
466
467 ceph_encode_32(&dest, ci->i_xattrs.count);
468 while (p) {
469 xattr = rb_entry(p, struct ceph_inode_xattr, node);
470
471 ceph_encode_32(&dest, xattr->name_len);
472 memcpy(dest, xattr->name, xattr->name_len);
473 dest += xattr->name_len;
474 ceph_encode_32(&dest, xattr->val_len);
475 memcpy(dest, xattr->val, xattr->val_len);
476 dest += xattr->val_len;
477
478 p = rb_next(p);
479 }
480
481 /* adjust buffer len; it may be larger than we need */
482 ci->i_xattrs.prealloc_blob->vec.iov_len =
483 dest - ci->i_xattrs.prealloc_blob->vec.iov_base;
484
485 if (ci->i_xattrs.blob)
486 ceph_buffer_put(ci->i_xattrs.blob);
487 ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
488 ci->i_xattrs.prealloc_blob = NULL;
489 ci->i_xattrs.dirty = false;
490 }
491}
492
493ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
494 size_t size)
495{
496 struct inode *inode = dentry->d_inode;
497 struct ceph_inode_info *ci = ceph_inode(inode);
498 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
499 int err;
500 struct ceph_inode_xattr *xattr;
501 struct ceph_vxattr_cb *vxattr = NULL;
502
503 if (!ceph_is_valid_xattr(name))
504 return -ENODATA;
505
506 /* let's see if a virtual xattr was requested */
507 if (vxattrs)
508 vxattr = ceph_match_vxattr(vxattrs, name);
509
510 spin_lock(&inode->i_lock);
511 dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
512 ci->i_xattrs.version, ci->i_xattrs.index_version);
513
514 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
515 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
516 goto get_xattr;
517 } else {
518 spin_unlock(&inode->i_lock);
519 /* get xattrs from mds (if we don't already have them) */
520 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
521 if (err)
522 return err;
523 }
524
525 spin_lock(&inode->i_lock);
526
527 if (vxattr && vxattr->readonly) {
528 err = vxattr->getxattr_cb(ci, value, size);
529 goto out;
530 }
531
532 err = __build_xattrs(inode);
533 if (err < 0)
534 goto out;
535
536get_xattr:
537 err = -ENODATA; /* == ENOATTR */
538 xattr = __get_xattr(ci, name);
539 if (!xattr) {
540 if (vxattr)
541 err = vxattr->getxattr_cb(ci, value, size);
542 goto out;
543 }
544
545 err = -ERANGE;
546 if (size && size < xattr->val_len)
547 goto out;
548
549 err = xattr->val_len;
550 if (size == 0)
551 goto out;
552
553 memcpy(value, xattr->val, xattr->val_len);
554
555out:
556 spin_unlock(&inode->i_lock);
557 return err;
558}
559
560ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
561{
562 struct inode *inode = dentry->d_inode;
563 struct ceph_inode_info *ci = ceph_inode(inode);
564 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
565 u32 vir_namelen = 0;
566 u32 namelen;
567 int err;
568 u32 len;
569 int i;
570
571 spin_lock(&inode->i_lock);
572 dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
573 ci->i_xattrs.version, ci->i_xattrs.index_version);
574
575 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
576 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
577 goto list_xattr;
578 } else {
579 spin_unlock(&inode->i_lock);
580 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
581 if (err)
582 return err;
583 }
584
585 spin_lock(&inode->i_lock);
586
587 err = __build_xattrs(inode);
588 if (err < 0)
589 goto out;
590
591list_xattr:
592 vir_namelen = 0;
593 /* include virtual dir xattrs */
594 if (vxattrs)
595 for (i = 0; vxattrs[i].name; i++)
596 vir_namelen += strlen(vxattrs[i].name) + 1;
597 /* add one byte per name for the trailing NUL */
598 namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
599 err = -ERANGE;
600 if (size && namelen > size)
601 goto out;
602
603 err = namelen;
604 if (size == 0)
605 goto out;
606
607 names = __copy_xattr_names(ci, names);
608
609 /* virtual xattr names, too */
610 if (vxattrs)
611 for (i = 0; vxattrs[i].name; i++) {
612 len = sprintf(names, "%s", vxattrs[i].name);
613 names += len + 1;
614 }
615
616out:
617 spin_unlock(&inode->i_lock);
618 return err;
619}
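
The buffer filled in here follows the usual listxattr(2) convention: NUL-terminated names packed back to back, with the return value covering the whole run. A minimal userspace sketch of walking such a buffer (the path is a placeholder):

    #include <stdio.h>
    #include <string.h>
    #include <sys/xattr.h>

    static void print_xattr_names(const char *path)
    {
            char buf[4096];
            ssize_t len = listxattr(path, buf, sizeof(buf));

            if (len < 0)
                    return;
            /* "name1\0name2\0...": step over one NUL at a time */
            for (char *p = buf; p < buf + len; p += strlen(p) + 1)
                    printf("%s\n", p);
    }
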
620
621static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
622 const char *value, size_t size, int flags)
623{
624 struct ceph_client *client = ceph_client(dentry->d_sb);
625 struct inode *inode = dentry->d_inode;
626 struct ceph_inode_info *ci = ceph_inode(inode);
627 struct inode *parent_inode = dentry->d_parent->d_inode;
628 struct ceph_mds_request *req;
629 struct ceph_mds_client *mdsc = &client->mdsc;
630 int err;
631 int i, nr_pages;
632 struct page **pages = NULL;
633 void *kaddr;
634
635 /* copy value into some pages */
636 nr_pages = calc_pages_for(0, size);
637 if (nr_pages) {
638 pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS);
639 if (!pages)
640 return -ENOMEM;
641 err = -ENOMEM;
642 for (i = 0; i < nr_pages; i++) {
643 pages[i] = alloc_page(GFP_NOFS);
644 if (!pages[i]) {
645 nr_pages = i;
646 goto out;
647 }
648 kaddr = kmap(pages[i]);
649 memcpy(kaddr, value + i*PAGE_CACHE_SIZE,
650 min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE));
651 }
652 }
653
654 dout("setxattr value=%.*s\n", (int)size, value);
655
656 /* do request */
657 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
658 USE_AUTH_MDS);
659 if (IS_ERR(req)) {
660 err = PTR_ERR(req);
661 goto out;
662 }
663 req->r_inode = igrab(inode);
664 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
665 req->r_num_caps = 1;
666 req->r_args.setxattr.flags = cpu_to_le32(flags);
667 req->r_path2 = kstrdup(name, GFP_NOFS);
668
669 req->r_pages = pages;
670 req->r_num_pages = nr_pages;
671 req->r_data_len = size;
672
673 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
674 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
675 ceph_mdsc_put_request(req);
676 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
677
678out:
679 if (pages) {
680 for (i = 0; i < nr_pages; i++)
681 __free_page(pages[i]);
682 kfree(pages);
683 }
684 return err;
685}
686
687int ceph_setxattr(struct dentry *dentry, const char *name,
688 const void *value, size_t size, int flags)
689{
690 struct inode *inode = dentry->d_inode;
691 struct ceph_inode_info *ci = ceph_inode(inode);
692 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
693 int err;
694 int name_len = strlen(name);
695 int val_len = size;
696 char *newname = NULL;
697 char *newval = NULL;
698 struct ceph_inode_xattr *xattr = NULL;
699 int issued;
700 int required_blob_size;
701
702 if (ceph_snap(inode) != CEPH_NOSNAP)
703 return -EROFS;
704
705 if (!ceph_is_valid_xattr(name))
706 return -EOPNOTSUPP;
707
708 if (vxattrs) {
709 struct ceph_vxattr_cb *vxattr =
710 ceph_match_vxattr(vxattrs, name);
711 if (vxattr && vxattr->readonly)
712 return -EOPNOTSUPP;
713 }
714
715 /* preallocate memory for xattr name, value, index node */
716 err = -ENOMEM;
717 newname = kmalloc(name_len + 1, GFP_NOFS);
718 if (!newname)
719 goto out;
720 memcpy(newname, name, name_len + 1);
721
722 if (val_len) {
723 newval = kmalloc(val_len + 1, GFP_NOFS);
724 if (!newval)
725 goto out;
726 memcpy(newval, value, val_len);
727 newval[val_len] = '\0';
728 }
729
730 xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
731 if (!xattr)
732 goto out;
733
734 spin_lock(&inode->i_lock);
735retry:
736 issued = __ceph_caps_issued(ci, NULL);
737 if (!(issued & CEPH_CAP_XATTR_EXCL))
738 goto do_sync;
739 __build_xattrs(inode);
740
741 required_blob_size = __get_required_blob_size(ci, name_len, val_len);
742
743 if (!ci->i_xattrs.prealloc_blob ||
744 required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
745 struct ceph_buffer *blob = NULL;
746
747 spin_unlock(&inode->i_lock);
748 dout(" preallocating new blob size=%d\n", required_blob_size);
749 blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
750 if (!blob)
751 goto out;
752 spin_lock(&inode->i_lock);
753 if (ci->i_xattrs.prealloc_blob)
754 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
755 ci->i_xattrs.prealloc_blob = blob;
756 goto retry;
757 }
758
759 dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
760 err = __set_xattr(ci, newname, name_len, newval,
761 val_len, 1, 1, 1, &xattr);
762 __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
763 ci->i_xattrs.dirty = true;
764 inode->i_ctime = CURRENT_TIME;
765 spin_unlock(&inode->i_lock);
766
767 return err;
768
769do_sync:
770 spin_unlock(&inode->i_lock);
771 err = ceph_sync_setxattr(dentry, name, value, size, flags);
772out:
773 kfree(newname);
774 kfree(newval);
775 kfree(xattr);
776 return err;
777}
778
779static int ceph_send_removexattr(struct dentry *dentry, const char *name)
780{
781 struct ceph_client *client = ceph_client(dentry->d_sb);
782 struct ceph_mds_client *mdsc = &client->mdsc;
783 struct inode *inode = dentry->d_inode;
784 struct inode *parent_inode = dentry->d_parent->d_inode;
785 struct ceph_mds_request *req;
786 int err;
787
788 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR,
789 USE_AUTH_MDS);
790 if (IS_ERR(req))
791 return PTR_ERR(req);
792 req->r_inode = igrab(inode);
793 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
794 req->r_num_caps = 1;
795 req->r_path2 = kstrdup(name, GFP_NOFS);
796
797 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
798 ceph_mdsc_put_request(req);
799 return err;
800}
801
802int ceph_removexattr(struct dentry *dentry, const char *name)
803{
804 struct inode *inode = dentry->d_inode;
805 struct ceph_inode_info *ci = ceph_inode(inode);
806 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
807 int issued;
808 int err;
809
810 if (ceph_snap(inode) != CEPH_NOSNAP)
811 return -EROFS;
812
813 if (!ceph_is_valid_xattr(name))
814 return -EOPNOTSUPP;
815
816 if (vxattrs) {
817 struct ceph_vxattr_cb *vxattr =
818 ceph_match_vxattr(vxattrs, name);
819 if (vxattr && vxattr->readonly)
820 return -EOPNOTSUPP;
821 }
822
823 spin_lock(&inode->i_lock);
824 __build_xattrs(inode);
825 issued = __ceph_caps_issued(ci, NULL);
826 dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
827
828 if (!(issued & CEPH_CAP_XATTR_EXCL))
829 goto do_sync;
830
831 err = __remove_xattr_by_name(ceph_inode(inode), name);
832 __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
833 ci->i_xattrs.dirty = true;
834 inode->i_ctime = CURRENT_TIME;
835
836 spin_unlock(&inode->i_lock);
837
838 return err;
839do_sync:
840 spin_unlock(&inode->i_lock);
841 err = ceph_send_removexattr(dentry, name);
842 return err;
843}
844
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8c6a03627176..5183bc2a1916 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -312,6 +312,7 @@ cifs_alloc_inode(struct super_block *sb)
312 cifs_inode->clientCanCacheRead = false; 312 cifs_inode->clientCanCacheRead = false;
313 cifs_inode->clientCanCacheAll = false; 313 cifs_inode->clientCanCacheAll = false;
314 cifs_inode->delete_pending = false; 314 cifs_inode->delete_pending = false;
315 cifs_inode->invalid_mapping = false;
315 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ 316 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
316 cifs_inode->server_eof = 0; 317 cifs_inode->server_eof = 0;
317 318
@@ -638,7 +639,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
638 setting the revalidate time to zero */ 639 setting the revalidate time to zero */
639 CIFS_I(file->f_path.dentry->d_inode)->time = 0; 640 CIFS_I(file->f_path.dentry->d_inode)->time = 0;
640 641
641 retval = cifs_revalidate(file->f_path.dentry); 642 retval = cifs_revalidate_file(file);
642 if (retval < 0) 643 if (retval < 0)
643 return (loff_t)retval; 644 return (loff_t)retval;
644 } 645 }
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 78c1b86d55f6..7aa57ecdc437 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -61,7 +61,8 @@ extern int cifs_mkdir(struct inode *, struct dentry *, int);
61extern int cifs_rmdir(struct inode *, struct dentry *); 61extern int cifs_rmdir(struct inode *, struct dentry *);
62extern int cifs_rename(struct inode *, struct dentry *, struct inode *, 62extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
63 struct dentry *); 63 struct dentry *);
64extern int cifs_revalidate(struct dentry *); 64extern int cifs_revalidate_file(struct file *filp);
65extern int cifs_revalidate_dentry(struct dentry *);
65extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 66extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
66extern int cifs_setattr(struct dentry *, struct iattr *); 67extern int cifs_setattr(struct dentry *, struct iattr *);
67 68
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index a1c817eb291a..63c89d1d70b5 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -389,6 +389,7 @@ struct cifsInodeInfo {
389 bool clientCanCacheRead:1; /* read oplock */ 389 bool clientCanCacheRead:1; /* read oplock */
390 bool clientCanCacheAll:1; /* read and writebehind oplock */ 390 bool clientCanCacheAll:1; /* read and writebehind oplock */
391 bool delete_pending:1; /* DELETE_ON_CLOSE is set */ 391 bool delete_pending:1; /* DELETE_ON_CLOSE is set */
392 bool invalid_mapping:1; /* pagecache is invalid */
392 u64 server_eof; /* current file size on server */ 393 u64 server_eof; /* current file size on server */
393 u64 uniqueid; /* server inode number */ 394 u64 uniqueid; /* server inode number */
394 struct inode vfs_inode; 395 struct inode vfs_inode;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 88e2bc44ac58..39e47f46dea5 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -104,10 +104,12 @@ extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr);
104extern struct inode *cifs_iget(struct super_block *sb, 104extern struct inode *cifs_iget(struct super_block *sb,
105 struct cifs_fattr *fattr); 105 struct cifs_fattr *fattr);
106 106
107extern int cifs_get_file_info(struct file *filp);
107extern int cifs_get_inode_info(struct inode **pinode, 108extern int cifs_get_inode_info(struct inode **pinode,
108 const unsigned char *search_path, 109 const unsigned char *search_path,
109 FILE_ALL_INFO *pfile_info, 110 FILE_ALL_INFO *pfile_info,
110 struct super_block *sb, int xid, const __u16 *pfid); 111 struct super_block *sb, int xid, const __u16 *pfid);
112extern int cifs_get_file_info_unix(struct file *filp);
111extern int cifs_get_inode_info_unix(struct inode **pinode, 113extern int cifs_get_inode_info_unix(struct inode **pinode,
112 const unsigned char *search_path, 114 const unsigned char *search_path,
113 struct super_block *sb, int xid); 115 struct super_block *sb, int xid);
@@ -142,6 +144,8 @@ extern int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
142extern int CIFSFindClose(const int, struct cifsTconInfo *tcon, 144extern int CIFSFindClose(const int, struct cifsTconInfo *tcon,
143 const __u16 search_handle); 145 const __u16 search_handle);
144 146
147extern int CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
148 u16 netfid, FILE_ALL_INFO *pFindData);
145extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon, 149extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
146 const unsigned char *searchName, 150 const unsigned char *searchName,
147 FILE_ALL_INFO *findData, 151 FILE_ALL_INFO *findData,
@@ -152,6 +156,8 @@ extern int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
152 FILE_ALL_INFO *findData, 156 FILE_ALL_INFO *findData,
153 const struct nls_table *nls_codepage, int remap); 157 const struct nls_table *nls_codepage, int remap);
154 158
159extern int CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
160 u16 netfid, FILE_UNIX_BASIC_INFO *pFindData);
155extern int CIFSSMBUnixQPathInfo(const int xid, 161extern int CIFSSMBUnixQPathInfo(const int xid,
156 struct cifsTconInfo *tcon, 162 struct cifsTconInfo *tcon,
157 const unsigned char *searchName, 163 const unsigned char *searchName,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 611835899844..7cc7f83e9314 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -500,7 +500,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
500 } else if (pSMBr->hdr.WordCount == 13) { 500 } else if (pSMBr->hdr.WordCount == 13) {
501 cERROR(1, ("mount failed, cifs module not built " 501 cERROR(1, ("mount failed, cifs module not built "
502 "with CIFS_WEAK_PW_HASH support")); 502 "with CIFS_WEAK_PW_HASH support"));
503 rc = -EOPNOTSUPP; 503 rc = -EOPNOTSUPP;
504#endif /* WEAK_PW_HASH */ 504#endif /* WEAK_PW_HASH */
505 goto neg_err_exit; 505 goto neg_err_exit;
506 } else if (pSMBr->hdr.WordCount != 17) { 506 } else if (pSMBr->hdr.WordCount != 17) {
@@ -3230,8 +3230,72 @@ QInfRetry:
3230 return rc; 3230 return rc;
3231} 3231}
3232 3232
3233int
3234CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
3235 u16 netfid, FILE_ALL_INFO *pFindData)
3236{
3237 struct smb_t2_qfi_req *pSMB = NULL;
3238 struct smb_t2_qfi_rsp *pSMBr = NULL;
3239 int rc = 0;
3240 int bytes_returned;
3241 __u16 params, byte_count;
3233 3242
3243QFileInfoRetry:
3244 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
3245 (void **) &pSMBr);
3246 if (rc)
3247 return rc;
3234 3248
3249 params = 2 /* level */ + 2 /* fid */;
3250 pSMB->t2.TotalDataCount = 0;
3251 pSMB->t2.MaxParameterCount = cpu_to_le16(4);
3252 /* BB find exact max data count below from sess structure BB */
3253 pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
3254 pSMB->t2.MaxSetupCount = 0;
3255 pSMB->t2.Reserved = 0;
3256 pSMB->t2.Flags = 0;
3257 pSMB->t2.Timeout = 0;
3258 pSMB->t2.Reserved2 = 0;
3259 pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
3260 Fid) - 4);
3261 pSMB->t2.DataCount = 0;
3262 pSMB->t2.DataOffset = 0;
3263 pSMB->t2.SetupCount = 1;
3264 pSMB->t2.Reserved3 = 0;
3265 pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
3266 byte_count = params + 1 /* pad */ ;
3267 pSMB->t2.TotalParameterCount = cpu_to_le16(params);
3268 pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
3269 pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
3270 pSMB->Pad = 0;
3271 pSMB->Fid = netfid;
3272 pSMB->hdr.smb_buf_length += byte_count;
3273
3274 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3275 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3276 if (rc) {
3277 cFYI(1, ("Send error in QFileInfo = %d", rc));
3278 } else { /* decode response */
3279 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3280
3281 if (rc) /* BB add auto retry on EOPNOTSUPP? */
3282 rc = -EIO;
3283 else if (pSMBr->ByteCount < 40)
3284 rc = -EIO; /* bad smb */
3285 else if (pFindData) {
3286 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
3287 memcpy((char *) pFindData,
3288 (char *) &pSMBr->hdr.Protocol +
3289 data_offset, sizeof(FILE_ALL_INFO));
3290 } else
3291 rc = -ENOMEM;
3292 }
3293 cifs_buf_release(pSMB);
3294 if (rc == -EAGAIN)
3295 goto QFileInfoRetry;
3296
3297 return rc;
3298}
3235 3299
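
One detail worth spelling out: the ParameterOffset above is derived from the request struct itself, as offsetof(struct smb_t2_qfi_req, Fid) minus 4, because SMB offsets are measured from the start of the SMB header while the struct begins with the 4-byte transport length prefix. A generic sketch of the idiom, using illustrative types rather than the real cifs definitions:

    #include <stddef.h>
    #include <stdint.h>

    struct toy_req {
            uint32_t transport_len; /* 4-byte length prefix, not part of the SMB message */
            uint8_t  hdr[32];       /* stand-in for the SMB header and setup words */
            uint16_t fid;           /* parameter area starts here */
            uint16_t level;
    };

    /* wire offset of the parameter area, relative to the SMB header */
    static uint16_t param_offset(void)
    {
            return (uint16_t)(offsetof(struct toy_req, fid) - 4);
    }
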
3236int 3300int
3237CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon, 3301CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
@@ -3335,6 +3399,75 @@ QPathInfoRetry:
3335} 3399}
3336 3400
3337int 3401int
3402CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
3403 u16 netfid, FILE_UNIX_BASIC_INFO *pFindData)
3404{
3405 struct smb_t2_qfi_req *pSMB = NULL;
3406 struct smb_t2_qfi_rsp *pSMBr = NULL;
3407 int rc = 0;
3408 int bytes_returned;
3409 __u16 params, byte_count;
3410
3411UnixQFileInfoRetry:
3412 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
3413 (void **) &pSMBr);
3414 if (rc)
3415 return rc;
3416
3417 params = 2 /* level */ + 2 /* fid */;
3418 pSMB->t2.TotalDataCount = 0;
3419 pSMB->t2.MaxParameterCount = cpu_to_le16(4);
3420 /* BB find exact max data count below from sess structure BB */
3421 pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
3422 pSMB->t2.MaxSetupCount = 0;
3423 pSMB->t2.Reserved = 0;
3424 pSMB->t2.Flags = 0;
3425 pSMB->t2.Timeout = 0;
3426 pSMB->t2.Reserved2 = 0;
3427 pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
3428 Fid) - 4);
3429 pSMB->t2.DataCount = 0;
3430 pSMB->t2.DataOffset = 0;
3431 pSMB->t2.SetupCount = 1;
3432 pSMB->t2.Reserved3 = 0;
3433 pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
3434 byte_count = params + 1 /* pad */ ;
3435 pSMB->t2.TotalParameterCount = cpu_to_le16(params);
3436 pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
3437 pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC);
3438 pSMB->Pad = 0;
3439 pSMB->Fid = netfid;
3440 pSMB->hdr.smb_buf_length += byte_count;
3441
3442 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3443 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3444 if (rc) {
3445 cFYI(1, ("Send error in UnixQFileInfo = %d", rc));
3446 } else { /* decode response */
3447 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3448
3449 if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
3450 cERROR(1, ("Malformed FILE_UNIX_BASIC_INFO response.\n"
3451 "Unix Extensions can be disabled on mount "
3452 "by specifying the nosfu mount option."));
3453 rc = -EIO; /* bad smb */
3454 } else {
3455 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
3456 memcpy((char *) pFindData,
3457 (char *) &pSMBr->hdr.Protocol +
3458 data_offset,
3459 sizeof(FILE_UNIX_BASIC_INFO));
3460 }
3461 }
3462
3463 cifs_buf_release(pSMB);
3464 if (rc == -EAGAIN)
3465 goto UnixQFileInfoRetry;
3466
3467 return rc;
3468}
3469
3470int
3338CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon, 3471CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon,
3339 const unsigned char *searchName, 3472 const unsigned char *searchName,
3340 FILE_UNIX_BASIC_INFO *pFindData, 3473 FILE_UNIX_BASIC_INFO *pFindData,
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 6ccf7262d1b7..e9f7ecc2714b 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -739,7 +739,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
739 int isValid = 1; 739 int isValid = 1;
740 740
741 if (direntry->d_inode) { 741 if (direntry->d_inode) {
742 if (cifs_revalidate(direntry)) 742 if (cifs_revalidate_dentry(direntry))
743 return 0; 743 return 0;
744 } else { 744 } else {
745 cFYI(1, ("neg dentry 0x%p name = %s", 745 cFYI(1, ("neg dentry 0x%p name = %s",
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 3d8f8a96f5a3..ca2ba7a0193c 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -219,8 +219,8 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
219 cFYI(1, ("inode unchanged on server")); 219 cFYI(1, ("inode unchanged on server"));
220 } else { 220 } else {
221 if (file->f_path.dentry->d_inode->i_mapping) { 221 if (file->f_path.dentry->d_inode->i_mapping) {
222 /* BB no need to lock inode until after invalidate 222 /* BB no need to lock inode until after invalidate
223 since namei code should already have it locked? */ 223 since namei code should already have it locked? */
224 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping); 224 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
225 if (rc != 0) 225 if (rc != 0)
226 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc; 226 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
@@ -1890,11 +1890,10 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1890 1890
1891int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) 1891int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
1892{ 1892{
1893 struct dentry *dentry = file->f_path.dentry;
1894 int rc, xid; 1893 int rc, xid;
1895 1894
1896 xid = GetXid(); 1895 xid = GetXid();
1897 rc = cifs_revalidate(dentry); 1896 rc = cifs_revalidate_file(file);
1898 if (rc) { 1897 if (rc) {
1899 cFYI(1, ("Validation prior to mmap failed, error=%d", rc)); 1898 cFYI(1, ("Validation prior to mmap failed, error=%d", rc));
1900 FreeXid(xid); 1899 FreeXid(xid);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 8bdbc818164c..723daaccbd0e 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -77,6 +77,41 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
77 } 77 }
78} 78}
79 79
80/* check inode attributes against fattr. If they don't match, tag the
81 * inode for cache invalidation
82 */
83static void
84cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
85{
86 struct cifsInodeInfo *cifs_i = CIFS_I(inode);
87
88 cFYI(1, ("%s: revalidating inode %llu", __func__, cifs_i->uniqueid));
89
90 if (inode->i_state & I_NEW) {
91 cFYI(1, ("%s: inode %llu is new", __func__, cifs_i->uniqueid));
92 return;
93 }
94
95 /* don't bother with revalidation if we have an oplock */
96 if (cifs_i->clientCanCacheRead) {
97 cFYI(1, ("%s: inode %llu is oplocked", __func__,
98 cifs_i->uniqueid));
99 return;
100 }
101
102 /* revalidate if mtime or size have changed */
103 if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) &&
104 cifs_i->server_eof == fattr->cf_eof) {
105 cFYI(1, ("%s: inode %llu is unchanged", __func__,
106 cifs_i->uniqueid));
107 return;
108 }
109
110 cFYI(1, ("%s: invalidating inode %llu mapping", __func__,
111 cifs_i->uniqueid));
112 cifs_i->invalid_mapping = true;
113}
114
80/* populate an inode with info from a cifs_fattr struct */ 115/* populate an inode with info from a cifs_fattr struct */
81void 116void
82cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) 117cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
@@ -85,6 +120,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
85 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 120 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
86 unsigned long oldtime = cifs_i->time; 121 unsigned long oldtime = cifs_i->time;
87 122
123 cifs_revalidate_cache(inode, fattr);
124
88 inode->i_atime = fattr->cf_atime; 125 inode->i_atime = fattr->cf_atime;
89 inode->i_mtime = fattr->cf_mtime; 126 inode->i_mtime = fattr->cf_mtime;
90 inode->i_ctime = fattr->cf_ctime; 127 inode->i_ctime = fattr->cf_ctime;
@@ -231,6 +268,31 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
231 fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL; 268 fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL;
232} 269}
233 270
271int cifs_get_file_info_unix(struct file *filp)
272{
273 int rc;
274 int xid;
275 FILE_UNIX_BASIC_INFO find_data;
276 struct cifs_fattr fattr;
277 struct inode *inode = filp->f_path.dentry->d_inode;
278 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
279 struct cifsTconInfo *tcon = cifs_sb->tcon;
280 struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
281
282 xid = GetXid();
283 rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
284 if (!rc) {
285 cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
286 } else if (rc == -EREMOTE) {
287 cifs_create_dfs_fattr(&fattr, inode->i_sb);
288 rc = 0;
289 }
290 if (!rc)
291 cifs_fattr_to_inode(inode, &fattr);
292 FreeXid(xid);
293 return rc;
294}
295
234int cifs_get_inode_info_unix(struct inode **pinode, 296int cifs_get_inode_info_unix(struct inode **pinode,
235 const unsigned char *full_path, 297 const unsigned char *full_path,
236 struct super_block *sb, int xid) 298 struct super_block *sb, int xid)
@@ -432,6 +494,47 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
432 fattr->cf_gid = cifs_sb->mnt_gid; 494 fattr->cf_gid = cifs_sb->mnt_gid;
433} 495}
434 496
497int cifs_get_file_info(struct file *filp)
498{
499 int rc;
500 int xid;
501 FILE_ALL_INFO find_data;
502 struct cifs_fattr fattr;
503 struct inode *inode = filp->f_path.dentry->d_inode;
504 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
505 struct cifsTconInfo *tcon = cifs_sb->tcon;
506 struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
507
508 xid = GetXid();
509 rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
510 if (rc == -EOPNOTSUPP || rc == -EINVAL) {
511 /*
512 * FIXME: legacy server -- fall back to path-based call?
513 * for now, just skip revalidating and mark inode for
514 * immediate reval.
515 */
516 rc = 0;
517 CIFS_I(inode)->time = 0;
518 goto cgfi_exit;
519 } else if (rc == -EREMOTE) {
520 cifs_create_dfs_fattr(&fattr, inode->i_sb);
521 rc = 0;
522 } else if (rc)
523 goto cgfi_exit;
524
525 /*
526 * don't bother with SFU junk here -- just mark inode as needing
527 * revalidation.
528 */
529 cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false);
530 fattr.cf_uniqueid = CIFS_I(inode)->uniqueid;
531 fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
532 cifs_fattr_to_inode(inode, &fattr);
533cgfi_exit:
534 FreeXid(xid);
535 return rc;
536}
537
435int cifs_get_inode_info(struct inode **pinode, 538int cifs_get_inode_info(struct inode **pinode,
436 const unsigned char *full_path, FILE_ALL_INFO *pfindData, 539 const unsigned char *full_path, FILE_ALL_INFO *pfindData,
437 struct super_block *sb, int xid, const __u16 *pfid) 540 struct super_block *sb, int xid, const __u16 *pfid)
@@ -1389,135 +1492,103 @@ cifs_rename_exit:
1389 return rc; 1492 return rc;
1390} 1493}
1391 1494
1392int cifs_revalidate(struct dentry *direntry) 1495static bool
1496cifs_inode_needs_reval(struct inode *inode)
1393{ 1497{
1394 int xid; 1498 struct cifsInodeInfo *cifs_i = CIFS_I(inode);
1395 int rc = 0, wbrc = 0;
1396 char *full_path;
1397 struct cifs_sb_info *cifs_sb;
1398 struct cifsInodeInfo *cifsInode;
1399 loff_t local_size;
1400 struct timespec local_mtime;
1401 bool invalidate_inode = false;
1402 1499
1403 if (direntry->d_inode == NULL) 1500 if (cifs_i->clientCanCacheRead)
1404 return -ENOENT; 1501 return false;
1405 1502
1406 cifsInode = CIFS_I(direntry->d_inode); 1503 if (!lookupCacheEnabled)
1504 return true;
1407 1505
1408 if (cifsInode == NULL) 1506 if (cifs_i->time == 0)
1409 return -ENOENT; 1507 return true;
1410 1508
1411 /* no sense revalidating inode info on file that no one can write */ 1509 /* FIXME: the actimeo should be tunable */
1412 if (CIFS_I(direntry->d_inode)->clientCanCacheRead) 1510 if (time_after_eq(jiffies, cifs_i->time + HZ))
1413 return rc; 1511 return true;
1512
1513 return false;
1514}
1515
1516/* check invalid_mapping flag and zap the cache if it's set */
1517static void
1518cifs_invalidate_mapping(struct inode *inode)
1519{
1520 int rc;
1521 struct cifsInodeInfo *cifs_i = CIFS_I(inode);
1522
1523 cifs_i->invalid_mapping = false;
1524
1525 /* write back any cached data */
1526 if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
1527 rc = filemap_write_and_wait(inode->i_mapping);
1528 if (rc)
1529 cifs_i->write_behind_rc = rc;
1530 }
1531 invalidate_remote_inode(inode);
1532}
1533
1534int cifs_revalidate_file(struct file *filp)
1535{
1536 int rc = 0;
1537 struct inode *inode = filp->f_path.dentry->d_inode;
1538
1539 if (!cifs_inode_needs_reval(inode))
1540 goto check_inval;
1541
1542 if (CIFS_SB(inode->i_sb)->tcon->unix_ext)
1543 rc = cifs_get_file_info_unix(filp);
1544 else
1545 rc = cifs_get_file_info(filp);
1546
1547check_inval:
1548 if (CIFS_I(inode)->invalid_mapping)
1549 cifs_invalidate_mapping(inode);
1550
1551 return rc;
1552}
1553
1554/* revalidate a dentry's inode attributes */
1555int cifs_revalidate_dentry(struct dentry *dentry)
1556{
1557 int xid;
1558 int rc = 0;
1559 char *full_path = NULL;
1560 struct inode *inode = dentry->d_inode;
1561 struct super_block *sb = dentry->d_sb;
1562
1563 if (inode == NULL)
1564 return -ENOENT;
1414 1565
1415 xid = GetXid(); 1566 xid = GetXid();
1416 1567
1417 cifs_sb = CIFS_SB(direntry->d_sb); 1568 if (!cifs_inode_needs_reval(inode))
1569 goto check_inval;
1418 1570
1419 /* can not safely grab the rename sem here if rename calls revalidate 1571 /* can not safely grab the rename sem here if rename calls revalidate
1420 since that would deadlock */ 1572 since that would deadlock */
1421 full_path = build_path_from_dentry(direntry); 1573 full_path = build_path_from_dentry(dentry);
1422 if (full_path == NULL) { 1574 if (full_path == NULL) {
1423 rc = -ENOMEM; 1575 rc = -ENOMEM;
1424 FreeXid(xid); 1576 goto check_inval;
1425 return rc;
1426 }
1427 cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
1428 "jiffies %ld", full_path, direntry->d_inode,
1429 direntry->d_inode->i_count.counter, direntry,
1430 direntry->d_time, jiffies));
1431
1432 if (cifsInode->time == 0) {
1433 /* was set to zero previously to force revalidate */
1434 } else if (time_before(jiffies, cifsInode->time + HZ) &&
1435 lookupCacheEnabled) {
1436 if ((S_ISREG(direntry->d_inode->i_mode) == 0) ||
1437 (direntry->d_inode->i_nlink == 1)) {
1438 kfree(full_path);
1439 FreeXid(xid);
1440 return rc;
1441 } else {
1442 cFYI(1, ("Have to revalidate file due to hardlinks"));
1443 }
1444 }
1445
1446 /* save mtime and size */
1447 local_mtime = direntry->d_inode->i_mtime;
1448 local_size = direntry->d_inode->i_size;
1449
1450 if (cifs_sb->tcon->unix_ext) {
1451 rc = cifs_get_inode_info_unix(&direntry->d_inode, full_path,
1452 direntry->d_sb, xid);
1453 if (rc) {
1454 cFYI(1, ("error on getting revalidate info %d", rc));
1455/* if (rc != -ENOENT)
1456 rc = 0; */ /* BB should we cache info on
1457 certain errors? */
1458 }
1459 } else {
1460 rc = cifs_get_inode_info(&direntry->d_inode, full_path, NULL,
1461 direntry->d_sb, xid, NULL);
1462 if (rc) {
1463 cFYI(1, ("error on getting revalidate info %d", rc));
1464/* if (rc != -ENOENT)
1465 rc = 0; */ /* BB should we cache info on
1466 certain errors? */
1467 }
1468 } 1577 }
1469 /* should we remap certain errors, access denied?, to zero */
1470 1578
1471 /* if not oplocked, we invalidate inode pages if mtime or file size 1579 cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
1472 had changed on server */ 1580 "jiffies %ld", full_path, inode, inode->i_count.counter,
1581 dentry, dentry->d_time, jiffies));
1473 1582
1474 if (timespec_equal(&local_mtime, &direntry->d_inode->i_mtime) && 1583 if (CIFS_SB(sb)->tcon->unix_ext)
1475 (local_size == direntry->d_inode->i_size)) { 1584 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
1476 cFYI(1, ("cifs_revalidate - inode unchanged")); 1585 else
1477 } else { 1586 rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
1478 /* file may have changed on server */ 1587 xid, NULL);
1479 if (cifsInode->clientCanCacheRead) {
1480 /* no need to invalidate inode pages since we were the
1481 only ones who could have modified the file and the
1482 server copy is staler than ours */
1483 } else {
1484 invalidate_inode = true;
1485 }
1486 }
1487 1588
1488 /* can not grab this sem since kernel filesys locking documentation 1589check_inval:
1489 indicates i_mutex may be taken by the kernel on lookup and rename 1590 if (CIFS_I(inode)->invalid_mapping)
1490 which could deadlock if we grab the i_mutex here as well */ 1591 cifs_invalidate_mapping(inode);
1491/* mutex_lock(&direntry->d_inode->i_mutex);*/
1492 /* need to write out dirty pages here */
1493 if (direntry->d_inode->i_mapping) {
1494 /* do we need to lock inode until after invalidate completes
1495 below? */
1496 wbrc = filemap_fdatawrite(direntry->d_inode->i_mapping);
1497 if (wbrc)
1498 CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
1499 }
1500 if (invalidate_inode) {
1501 /* shrink_dcache not necessary now that cifs dentry ops
1502 are exported for negative dentries */
1503/* if (S_ISDIR(direntry->d_inode->i_mode))
1504 shrink_dcache_parent(direntry); */
1505 if (S_ISREG(direntry->d_inode->i_mode)) {
1506 if (direntry->d_inode->i_mapping) {
1507 wbrc = filemap_fdatawait(direntry->d_inode->i_mapping);
1508 if (wbrc)
1509 CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
1510 }
1511 /* may eventually have to do this for open files too */
1512 if (list_empty(&(cifsInode->openFileList))) {
1513 /* changed on server - flush read ahead pages */
1514 cFYI(1, ("Invalidating read ahead data on "
1515 "closed file"));
1516 invalidate_remote_inode(direntry->d_inode);
1517 }
1518 }
1519 }
1520/* mutex_unlock(&direntry->d_inode->i_mutex); */
1521 1592
1522 kfree(full_path); 1593 kfree(full_path);
1523 FreeXid(xid); 1594 FreeXid(xid);
@@ -1527,7 +1598,7 @@ int cifs_revalidate(struct dentry *direntry)
1527int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, 1598int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1528 struct kstat *stat) 1599 struct kstat *stat)
1529{ 1600{
1530 int err = cifs_revalidate(dentry); 1601 int err = cifs_revalidate_dentry(dentry);
1531 if (!err) { 1602 if (!err) {
1532 generic_fillattr(dentry->d_inode, stat); 1603 generic_fillattr(dentry->d_inode, stat);
1533 stat->blksize = CIFS_MAX_MSGSIZE; 1604 stat->blksize = CIFS_MAX_MSGSIZE;
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index c598ea4c4e7d..69809024d71d 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -881,6 +881,7 @@ submit_failed:
881 goto nobufs; 881 goto nobufs;
882 882
883nobufs_unlock_obj: 883nobufs_unlock_obj:
884 spin_unlock(&cookie->stores_lock);
884 spin_unlock(&object->lock); 885 spin_unlock(&object->lock);
885nobufs: 886nobufs:
886 spin_unlock(&cookie->lock); 887 spin_unlock(&cookie->lock);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index ae8d02294e46..ae0d92736531 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -491,7 +491,8 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
491{ 491{
492 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); 492 dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
493 493
494 if (gfp & __GFP_WAIT) 494 /* Only do I/O if gfp is a superset of GFP_KERNEL */
495 if ((gfp & GFP_KERNEL) == GFP_KERNEL)
495 nfs_wb_page(page->mapping->host, page); 496 nfs_wb_page(page->mapping->host, page);
496 /* If PagePrivate() is set, then the page is not freeable */ 497 /* If PagePrivate() is set, then the page is not freeable */
497 if (PagePrivate(page)) 498 if (PagePrivate(page))
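
The rewritten test is stricter on purpose: __GFP_WAIT alone is also set in GFP_NOFS allocations, where starting filesystem writeback could recurse into the fs; requiring every bit of GFP_KERNEL rules those out. A small standalone sketch of the difference, using illustrative stand-ins for the real gfp bits:

    #include <stdio.h>

    #define X_GFP_WAIT      0x10u
    #define X_GFP_IO        0x40u
    #define X_GFP_FS        0x80u
    #define X_GFP_KERNEL    (X_GFP_WAIT | X_GFP_IO | X_GFP_FS)
    #define X_GFP_NOFS      (X_GFP_WAIT | X_GFP_IO)

    int main(void)
    {
            unsigned gfp = X_GFP_NOFS;

            /* old test: passes even though fs recursion is forbidden */
            printf("wait-bit test: %d\n", !!(gfp & X_GFP_WAIT));
            /* new test: only a full GFP_KERNEL superset may do I/O */
            printf("superset test: %d\n",
                   (gfp & X_GFP_KERNEL) == X_GFP_KERNEL);
            return 0;
    }
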
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4d338be492cb..dd17713413a5 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -5552,6 +5552,8 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf
5552 if (status != 0) 5552 if (status != 0)
5553 goto out; 5553 goto out;
5554 status = decode_delegreturn(&xdr); 5554 status = decode_delegreturn(&xdr);
5555 if (status != 0)
5556 goto out;
5555 decode_getfattr(&xdr, res->fattr, res->server, 5557 decode_getfattr(&xdr, res->fattr, res->server,
5556 !RPC_IS_ASYNC(rqstp->rq_task)); 5558 !RPC_IS_ASYNC(rqstp->rq_task));
5557out: 5559out:
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 0028d2ef0662..90be97f1f5a8 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -31,14 +31,17 @@
31 */ 31 */
32#include <asm/unaligned.h> 32#include <asm/unaligned.h>
33 33
34#define SYS_IND(p) (get_unaligned(&p->sys_ind)) 34#define SYS_IND(p) get_unaligned(&p->sys_ind)
35#define NR_SECTS(p) ({ __le32 __a = get_unaligned(&p->nr_sects); \
36 le32_to_cpu(__a); \
37 })
38 35
39#define START_SECT(p) ({ __le32 __a = get_unaligned(&p->start_sect); \ 36static inline sector_t nr_sects(struct partition *p)
40 le32_to_cpu(__a); \ 37{
41 }) 38 return (sector_t)get_unaligned_le32(&p->nr_sects);
39}
40
41static inline sector_t start_sect(struct partition *p)
42{
43 return (sector_t)get_unaligned_le32(&p->start_sect);
44}
42 45
43static inline int is_extended_partition(struct partition *p) 46static inline int is_extended_partition(struct partition *p)
44{ 47{
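
The switch from u32 macros to sector_t-returning helpers is about arithmetic width: with 512-byte sectors a u32 count tops out at 2 TiB, and sums such as start_sect + nr_sects wrap silently past 2^32, while sector_t (64-bit with CONFIG_LBDAF) does not. A small standalone sketch of the wraparound the widening avoids:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t start = 4294967000u;   /* sector just below 2^32 */
            uint32_t span  = 1000u;

            /* 32-bit arithmetic wraps around... */
            printf("u32: %u\n", start + span);
            /* ...64-bit (sector_t-sized) arithmetic does not */
            printf("u64: %llu\n", (unsigned long long)start + span);
            return 0;
    }
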
@@ -104,13 +107,13 @@ static int aix_magic_present(unsigned char *p, struct block_device *bdev)
104 107
105static void 108static void
106parse_extended(struct parsed_partitions *state, struct block_device *bdev, 109parse_extended(struct parsed_partitions *state, struct block_device *bdev,
107 u32 first_sector, u32 first_size) 110 sector_t first_sector, sector_t first_size)
108{ 111{
109 struct partition *p; 112 struct partition *p;
110 Sector sect; 113 Sector sect;
111 unsigned char *data; 114 unsigned char *data;
112 u32 this_sector, this_size; 115 sector_t this_sector, this_size;
113 int sector_size = bdev_logical_block_size(bdev) / 512; 116 sector_t sector_size = bdev_logical_block_size(bdev) / 512;
114 int loopct = 0; /* number of links followed 117 int loopct = 0; /* number of links followed
115 without finding a data partition */ 118 without finding a data partition */
116 int i; 119 int i;
@@ -145,14 +148,14 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
145 * First process the data partition(s) 148 * First process the data partition(s)
146 */ 149 */
147 for (i=0; i<4; i++, p++) { 150 for (i=0; i<4; i++, p++) {
148 u32 offs, size, next; 151 sector_t offs, size, next;
149 if (!NR_SECTS(p) || is_extended_partition(p)) 152 if (!nr_sects(p) || is_extended_partition(p))
150 continue; 153 continue;
151 154
152 /* Check the 3rd and 4th entries - 155 /* Check the 3rd and 4th entries -
153 these sometimes contain random garbage */ 156 these sometimes contain random garbage */
154 offs = START_SECT(p)*sector_size; 157 offs = start_sect(p)*sector_size;
155 size = NR_SECTS(p)*sector_size; 158 size = nr_sects(p)*sector_size;
156 next = this_sector + offs; 159 next = this_sector + offs;
157 if (i >= 2) { 160 if (i >= 2) {
158 if (offs + size > this_size) 161 if (offs + size > this_size)
@@ -179,13 +182,13 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
179 */ 182 */
180 p -= 4; 183 p -= 4;
181 for (i=0; i<4; i++, p++) 184 for (i=0; i<4; i++, p++)
182 if (NR_SECTS(p) && is_extended_partition(p)) 185 if (nr_sects(p) && is_extended_partition(p))
183 break; 186 break;
184 if (i == 4) 187 if (i == 4)
185 goto done; /* nothing left to do */ 188 goto done; /* nothing left to do */
186 189
187 this_sector = first_sector + START_SECT(p) * sector_size; 190 this_sector = first_sector + start_sect(p) * sector_size;
188 this_size = NR_SECTS(p) * sector_size; 191 this_size = nr_sects(p) * sector_size;
189 put_dev_sector(sect); 192 put_dev_sector(sect);
190 } 193 }
191done: 194done:
@@ -197,7 +200,7 @@ done:
197 200
198static void 201static void
199parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev, 202parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
200 u32 offset, u32 size, int origin) 203 sector_t offset, sector_t size, int origin)
201{ 204{
202#ifdef CONFIG_SOLARIS_X86_PARTITION 205#ifdef CONFIG_SOLARIS_X86_PARTITION
203 Sector sect; 206 Sector sect;
@@ -244,7 +247,7 @@ parse_solaris_x86(struct parsed_partitions *state, struct block_device *bdev,
244 */ 247 */
245static void 248static void
246parse_bsd(struct parsed_partitions *state, struct block_device *bdev, 249parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
247 u32 offset, u32 size, int origin, char *flavour, 250 sector_t offset, sector_t size, int origin, char *flavour,
248 int max_partitions) 251 int max_partitions)
249{ 252{
250 Sector sect; 253 Sector sect;
@@ -263,7 +266,7 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
263 if (le16_to_cpu(l->d_npartitions) < max_partitions) 266 if (le16_to_cpu(l->d_npartitions) < max_partitions)
264 max_partitions = le16_to_cpu(l->d_npartitions); 267 max_partitions = le16_to_cpu(l->d_npartitions);
265 for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) { 268 for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) {
266 u32 bsd_start, bsd_size; 269 sector_t bsd_start, bsd_size;
267 270
268 if (state->next == state->limit) 271 if (state->next == state->limit)
269 break; 272 break;
@@ -290,7 +293,7 @@ parse_bsd(struct parsed_partitions *state, struct block_device *bdev,
290 293
291static void 294static void
292parse_freebsd(struct parsed_partitions *state, struct block_device *bdev, 295parse_freebsd(struct parsed_partitions *state, struct block_device *bdev,
293 u32 offset, u32 size, int origin) 296 sector_t offset, sector_t size, int origin)
294{ 297{
295#ifdef CONFIG_BSD_DISKLABEL 298#ifdef CONFIG_BSD_DISKLABEL
296 parse_bsd(state, bdev, offset, size, origin, 299 parse_bsd(state, bdev, offset, size, origin,
@@ -300,7 +303,7 @@ parse_freebsd(struct parsed_partitions *state, struct block_device *bdev,
300 303
301static void 304static void
302parse_netbsd(struct parsed_partitions *state, struct block_device *bdev, 305parse_netbsd(struct parsed_partitions *state, struct block_device *bdev,
303 u32 offset, u32 size, int origin) 306 sector_t offset, sector_t size, int origin)
304{ 307{
305#ifdef CONFIG_BSD_DISKLABEL 308#ifdef CONFIG_BSD_DISKLABEL
306 parse_bsd(state, bdev, offset, size, origin, 309 parse_bsd(state, bdev, offset, size, origin,
@@ -310,7 +313,7 @@ parse_netbsd(struct parsed_partitions *state, struct block_device *bdev,
310 313
311static void 314static void
312parse_openbsd(struct parsed_partitions *state, struct block_device *bdev, 315parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
313 u32 offset, u32 size, int origin) 316 sector_t offset, sector_t size, int origin)
314{ 317{
315#ifdef CONFIG_BSD_DISKLABEL 318#ifdef CONFIG_BSD_DISKLABEL
316 parse_bsd(state, bdev, offset, size, origin, 319 parse_bsd(state, bdev, offset, size, origin,
@@ -324,7 +327,7 @@ parse_openbsd(struct parsed_partitions *state, struct block_device *bdev,
324 */ 327 */
325static void 328static void
326parse_unixware(struct parsed_partitions *state, struct block_device *bdev, 329parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
327 u32 offset, u32 size, int origin) 330 sector_t offset, sector_t size, int origin)
328{ 331{
329#ifdef CONFIG_UNIXWARE_DISKLABEL 332#ifdef CONFIG_UNIXWARE_DISKLABEL
330 Sector sect; 333 Sector sect;
@@ -348,7 +351,8 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
348 351
349 if (p->s_label != UNIXWARE_FS_UNUSED) 352 if (p->s_label != UNIXWARE_FS_UNUSED)
350 put_partition(state, state->next++, 353 put_partition(state, state->next++,
351 START_SECT(p), NR_SECTS(p)); 354 le32_to_cpu(p->start_sect),
355 le32_to_cpu(p->nr_sects));
352 p++; 356 p++;
353 } 357 }
354 put_dev_sector(sect); 358 put_dev_sector(sect);
@@ -363,7 +367,7 @@ parse_unixware(struct parsed_partitions *state, struct block_device *bdev,
363 */ 367 */
364static void 368static void
365parse_minix(struct parsed_partitions *state, struct block_device *bdev, 369parse_minix(struct parsed_partitions *state, struct block_device *bdev,
366 u32 offset, u32 size, int origin) 370 sector_t offset, sector_t size, int origin)
367{ 371{
368#ifdef CONFIG_MINIX_SUBPARTITION 372#ifdef CONFIG_MINIX_SUBPARTITION
369 Sector sect; 373 Sector sect;
@@ -390,7 +394,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
390 /* add each partition in use */ 394 /* add each partition in use */
391 if (SYS_IND(p) == MINIX_PARTITION) 395 if (SYS_IND(p) == MINIX_PARTITION)
392 put_partition(state, state->next++, 396 put_partition(state, state->next++,
393 START_SECT(p), NR_SECTS(p)); 397 start_sect(p), nr_sects(p));
394 } 398 }
395 printk(" >\n"); 399 printk(" >\n");
396 } 400 }
@@ -401,7 +405,7 @@ parse_minix(struct parsed_partitions *state, struct block_device *bdev,
401static struct { 405static struct {
402 unsigned char id; 406 unsigned char id;
403 void (*parse)(struct parsed_partitions *, struct block_device *, 407 void (*parse)(struct parsed_partitions *, struct block_device *,
404 u32, u32, int); 408 sector_t, sector_t, int);
405} subtypes[] = { 409} subtypes[] = {
406 {FREEBSD_PARTITION, parse_freebsd}, 410 {FREEBSD_PARTITION, parse_freebsd},
407 {NETBSD_PARTITION, parse_netbsd}, 411 {NETBSD_PARTITION, parse_netbsd},
@@ -415,7 +419,7 @@ static struct {
415 419
416int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) 420int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
417{ 421{
418 int sector_size = bdev_logical_block_size(bdev) / 512; 422 sector_t sector_size = bdev_logical_block_size(bdev) / 512;
419 Sector sect; 423 Sector sect;
420 unsigned char *data; 424 unsigned char *data;
421 struct partition *p; 425 struct partition *p;
@@ -483,14 +487,21 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
483 487
484 state->next = 5; 488 state->next = 5;
485 for (slot = 1 ; slot <= 4 ; slot++, p++) { 489 for (slot = 1 ; slot <= 4 ; slot++, p++) {
486 u32 start = START_SECT(p)*sector_size; 490 sector_t start = start_sect(p)*sector_size;
487 u32 size = NR_SECTS(p)*sector_size; 491 sector_t size = nr_sects(p)*sector_size;
488 if (!size) 492 if (!size)
489 continue; 493 continue;
490 if (is_extended_partition(p)) { 494 if (is_extended_partition(p)) {
491 /* prevent someone doing mkfs or mkswap on an 495 /*
492 extended partition, but leave room for LILO */ 496 * prevent someone doing mkfs or mkswap on an
493 put_partition(state, slot, start, size == 1 ? 1 : 2); 497 * extended partition, but leave room for LILO
498 * FIXME: this uses one logical sector for > 512b
499 * sector, although it may not be enough/proper.
500 */
501 sector_t n = 2;
502 n = min(size, max(sector_size, n));
503 put_partition(state, slot, start, n);
504
494 printk(" <"); 505 printk(" <");
495 parse_extended(state, bdev, start, size); 506 parse_extended(state, bdev, start, size);
496 printk(" >"); 507 printk(" >");
@@ -513,7 +524,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
513 unsigned char id = SYS_IND(p); 524 unsigned char id = SYS_IND(p);
514 int n; 525 int n;
515 526
516 if (!NR_SECTS(p)) 527 if (!nr_sects(p))
517 continue; 528 continue;
518 529
519 for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++) 530 for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++)
@@ -521,8 +532,8 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
521 532
522 if (!subtypes[n].parse) 533 if (!subtypes[n].parse)
523 continue; 534 continue;
524 subtypes[n].parse(state, bdev, START_SECT(p)*sector_size, 535 subtypes[n].parse(state, bdev, start_sect(p)*sector_size,
525 NR_SECTS(p)*sector_size, slot); 536 nr_sects(p)*sector_size, slot);
526 } 537 }
527 put_dev_sector(sect); 538 put_dev_sector(sect);
528 return 1; 539 return 1;
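
Context for the fdisk-parser hunks above: the point of the u32 -> sector_t conversion is that products like start_sect(p) * sector_size overflow 32 bits once a partition starts beyond 2 TiB. A self-contained sketch of that failure mode follows; the values are invented and only the arithmetic mirrors the patch.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;  /* the 64-bit sector type the patch switches to */

int main(void)
{
	uint32_t start = 4200000000u;  /* hypothetical start sector, ~2 TiB in */
	uint32_t sector_size = 8;      /* 4096-byte logical blocks / 512 */

	uint32_t wrong = start * sector_size;            /* wraps modulo 2^32 */
	sector_t right = (sector_t)start * sector_size;  /* widened first, as post-patch */

	printf("as u32:      %u\n", wrong);
	printf("as sector_t: %llu\n", (unsigned long long)right);
	return 0;
}
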
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index a44a7897fd4d..b442dac8f5f9 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -490,7 +490,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
490 } 490 }
491 read_unlock(&kclist_lock); 491 read_unlock(&kclist_lock);
492 492
493 if (m == NULL) { 493 if (&m->list == &kclist_head) {
494 if (clear_user(buffer, tsz)) 494 if (clear_user(buffer, tsz))
495 return -EFAULT; 495 return -EFAULT;
496 } else if (is_vmalloc_or_module_addr((void *)start)) { 496 } else if (is_vmalloc_or_module_addr((void *)start)) {
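
The kcore.c one-liner above is a classic list-iterator fix: when a list_for_each_entry() walk completes without a break, the cursor is never NULL — it holds a bogus container address computed from the list head — so completion must be tested as &m->list == &kclist_head. A stripped-down userspace rendering of the idiom (the list macros here are simplified reimplementations, not the kernel's):

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* simplified, singly linked take on the kernel's iterator */
#define list_for_each_entry(pos, head, type, member)              \
	for (pos = container_of((head)->next, type, member);      \
	     &pos->member != (head);                               \
	     pos = container_of(pos->member.next, type, member))

struct kcore_item { int id; struct list_head list; };

int main(void)
{
	struct list_head head;
	struct kcore_item a = { 1, { &head } };  /* one-entry circular list */
	struct kcore_item *m;

	head.next = &a.list;

	list_for_each_entry(m, &head, struct kcore_item, list)
		;  /* walk to completion without a break */

	/* the cursor is NOT NULL after a full walk... */
	printf("m == NULL:         %d\n", m == NULL);
	/* ...the reliable completion test is against the head, as in the fix */
	printf("&m->list == &head: %d\n", &m->list == &head);
	return 0;
}
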
diff --git a/fs/read_write.c b/fs/read_write.c
index b7f4a1f94d48..113386d6fd2d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -258,6 +258,7 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
258 init_sync_kiocb(&kiocb, filp); 258 init_sync_kiocb(&kiocb, filp);
259 kiocb.ki_pos = *ppos; 259 kiocb.ki_pos = *ppos;
260 kiocb.ki_left = len; 260 kiocb.ki_left = len;
261 kiocb.ki_nbytes = len;
261 262
262 for (;;) { 263 for (;;) {
263 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); 264 ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
@@ -313,6 +314,7 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
313 init_sync_kiocb(&kiocb, filp); 314 init_sync_kiocb(&kiocb, filp);
314 kiocb.ki_pos = *ppos; 315 kiocb.ki_pos = *ppos;
315 kiocb.ki_left = len; 316 kiocb.ki_left = len;
317 kiocb.ki_nbytes = len;
316 318
317 for (;;) { 319 for (;;) {
318 ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); 320 ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index ba98546fabbd..f3de5e8a2ae8 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2217,6 +2217,15 @@ static int journal_read_transaction(struct super_block *sb,
2217 brelse(d_bh); 2217 brelse(d_bh);
2218 return 1; 2218 return 1;
2219 } 2219 }
2220
2221 if (bdev_read_only(sb->s_bdev)) {
2222 reiserfs_warning(sb, "clm-2076",
2223 "device is readonly, unable to replay log");
2224 brelse(c_bh);
2225 brelse(d_bh);
2226 return -EROFS;
2227 }
2228
2220 trans_id = get_desc_trans_id(desc); 2229 trans_id = get_desc_trans_id(desc);
2221 /* now we know we've got a good transaction, and it was inside the valid time ranges */ 2230 /* now we know we've got a good transaction, and it was inside the valid time ranges */
2222 log_blocks = kmalloc(get_desc_trans_len(desc) * 2231 log_blocks = kmalloc(get_desc_trans_len(desc) *
@@ -2459,12 +2468,6 @@ static int journal_read(struct super_block *sb)
2459 goto start_log_replay; 2468 goto start_log_replay;
2460 } 2469 }
2461 2470
2462 if (continue_replay && bdev_read_only(sb->s_bdev)) {
2463 reiserfs_warning(sb, "clm-2076",
2464 "device is readonly, unable to replay log");
2465 return -1;
2466 }
2467
2468 /* ok, there are transactions that need to be replayed. start with the first log block, find 2471 /* ok, there are transactions that need to be replayed. start with the first log block, find
2469 ** all the valid transactions, and pick out the oldest. 2472 ** all the valid transactions, and pick out the oldest.
2470 */ 2473 */
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index d8b5bfcbdd30..de1fcffd906b 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -76,7 +76,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
76 return error; 76 return error;
77 } 77 }
78 78
79 if (sec->length) { 79 if (sec->length && reiserfs_xattrs_initialized(inode->i_sb)) {
80 blocks = reiserfs_xattr_jcreate_nblocks(inode) + 80 blocks = reiserfs_xattr_jcreate_nblocks(inode) +
81 reiserfs_xattr_nblocks(inode, sec->length); 81 reiserfs_xattr_nblocks(inode, sec->length);
82 /* We don't want to count the directories twice if we have 82 /* We don't want to count the directories twice if we have
diff --git a/include/acpi/acpi_drivers.h b/include/acpi/acpi_drivers.h
index 3a4767c01c5f..4f7b44866b76 100644
--- a/include/acpi/acpi_drivers.h
+++ b/include/acpi/acpi_drivers.h
@@ -65,6 +65,8 @@
65#define ACPI_VIDEO_HID "LNXVIDEO" 65#define ACPI_VIDEO_HID "LNXVIDEO"
66#define ACPI_BAY_HID "LNXIOBAY" 66#define ACPI_BAY_HID "LNXIOBAY"
67#define ACPI_DOCK_HID "LNXDOCK" 67#define ACPI_DOCK_HID "LNXDOCK"
68/* Quirk for broken IBM BIOSes */
69#define ACPI_SMBUS_IBM_HID "SMBUSIBM"
68 70
69/* 71/*
70 * For fixed hardware buttons, we fabricate acpi_devices with HID 72 * For fixed hardware buttons, we fabricate acpi_devices with HID
diff --git a/include/linux/circ_buf.h b/include/linux/circ_buf.h
index a2ed0591fb19..90f2471dc6f2 100644
--- a/include/linux/circ_buf.h
+++ b/include/linux/circ_buf.h
@@ -1,3 +1,7 @@
1/*
2 * See Documentation/circular-buffers.txt for more information.
3 */
4
1#ifndef _LINUX_CIRC_BUF_H 5#ifndef _LINUX_CIRC_BUF_H
2#define _LINUX_CIRC_BUF_H 1 6#define _LINUX_CIRC_BUF_H 1
3 7
diff --git a/include/linux/device.h b/include/linux/device.h
index 182192892d45..241b96bcd7ad 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -451,6 +451,10 @@ struct device {
451 451
452static inline const char *dev_name(const struct device *dev) 452static inline const char *dev_name(const struct device *dev)
453{ 453{
454 /* Use the init name until the kobject becomes available */
455 if (dev->init_name)
456 return dev->init_name;
457
454 return kobject_name(&dev->kobj); 458 return kobject_name(&dev->kobj);
455} 459}
456 460
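
The dev_name() change above lets callers ask for a device's name before device_register() has copied the driver-supplied init_name into the kobject. A minimal sketch of the same fallback pattern; the surrounding types are stand-ins so it compiles in userspace:

#include <stdio.h>

struct kobject { const char *name; };
struct device {
	const char *init_name;  /* set by the driver before registration */
	struct kobject kobj;    /* named only once the device is registered */
};

static const char *dev_name(const struct device *dev)
{
	/* Use the init name until the kobject becomes available */
	if (dev->init_name)
		return dev->init_name;
	return dev->kobj.name;
}

int main(void)
{
	struct device d = { .init_name = "ttyS0", .kobj = { .name = NULL } };
	printf("before registration: %s\n", dev_name(&d));  /* prints "ttyS0" */
	return 0;
}
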
diff --git a/include/linux/if_tunnel.h b/include/linux/if_tunnel.h
index 1822d635be6b..16b92d008bed 100644
--- a/include/linux/if_tunnel.h
+++ b/include/linux/if_tunnel.h
@@ -2,6 +2,7 @@
2#define _IF_TUNNEL_H_ 2#define _IF_TUNNEL_H_
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5#include <asm/byteorder.h>
5 6
6#ifdef __KERNEL__ 7#ifdef __KERNEL__
7#include <linux/ip.h> 8#include <linux/ip.h>
diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h
index bc0fc795bd35..ece0b1c33816 100644
--- a/include/linux/kfifo.h
+++ b/include/linux/kfifo.h
@@ -102,8 +102,6 @@ union { \
102 unsigned char name##kfifo_buffer[size]; \ 102 unsigned char name##kfifo_buffer[size]; \
103 struct kfifo name = __kfifo_initializer(size, name##kfifo_buffer) 103 struct kfifo name = __kfifo_initializer(size, name##kfifo_buffer)
104 104
105#undef __kfifo_initializer
106
107extern void kfifo_init(struct kfifo *fifo, void *buffer, 105extern void kfifo_init(struct kfifo *fifo, void *buffer,
108 unsigned int size); 106 unsigned int size);
109extern __must_check int kfifo_alloc(struct kfifo *fifo, unsigned int size, 107extern __must_check int kfifo_alloc(struct kfifo *fifo, unsigned int size,
diff --git a/include/linux/mmc/mmc.h b/include/linux/mmc/mmc.h
index c02c8db73701..8a49cbf0376d 100644
--- a/include/linux/mmc/mmc.h
+++ b/include/linux/mmc/mmc.h
@@ -268,6 +268,7 @@ struct _mmc_csd {
268 268
269#define EXT_CSD_CARD_TYPE_26 (1<<0) /* Card can run at 26MHz */ 269#define EXT_CSD_CARD_TYPE_26 (1<<0) /* Card can run at 26MHz */
270#define EXT_CSD_CARD_TYPE_52 (1<<1) /* Card can run at 52MHz */ 270#define EXT_CSD_CARD_TYPE_52 (1<<1) /* Card can run at 52MHz */
271#define EXT_CSD_CARD_TYPE_MASK 0x3 /* Mask out reserved and DDR bits */
271 272
272#define EXT_CSD_BUS_WIDTH_1 0 /* Card is in 1 bit mode */ 273#define EXT_CSD_BUS_WIDTH_1 0 /* Card is in 1 bit mode */
273#define EXT_CSD_BUS_WIDTH_4 1 /* Card is in 4 bit mode */ 274#define EXT_CSD_BUS_WIDTH_4 1 /* Card is in 4 bit mode */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c79a88be7c33..fa8b47637997 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2059,12 +2059,12 @@ static inline void skb_bond_set_mac_by_master(struct sk_buff *skb,
2059 * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and 2059 * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
2060 * ARP on active-backup slaves with arp_validate enabled. 2060 * ARP on active-backup slaves with arp_validate enabled.
2061 */ 2061 */
2062static inline int skb_bond_should_drop(struct sk_buff *skb) 2062static inline int skb_bond_should_drop(struct sk_buff *skb,
2063 struct net_device *master)
2063{ 2064{
2064 struct net_device *dev = skb->dev;
2065 struct net_device *master = dev->master;
2066
2067 if (master) { 2065 if (master) {
2066 struct net_device *dev = skb->dev;
2067
2068 if (master->priv_flags & IFF_MASTER_ARPMON) 2068 if (master->priv_flags & IFF_MASTER_ARPMON)
2069 dev->last_rx = jiffies; 2069 dev->last_rx = jiffies;
2070 2070
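
In the netdevice.h hunk, skb_bond_should_drop() now receives master from the caller instead of re-reading skb->dev->master internally; the callers (see the vlan_core.c hunks later in this diff) take one ACCESS_ONCE() snapshot so the check and the use agree even if bonding releases the slave concurrently. A userspace sketch of the snapshot idiom — the ACCESS_ONCE definition matches the kernel's, the rest is illustrative:

#include <stdio.h>

#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

struct net_device { struct net_device *master; const char *name; };

/* post-patch shape: the caller snapshots master once and passes it in,
 * so every check inside the helper sees the same pointer */
static int skb_bond_should_drop(struct net_device *dev,
				struct net_device *master)
{
	(void)dev;
	return master != NULL;  /* stand-in for the real duplicate logic */
}

int main(void)
{
	struct net_device bond = { NULL, "bond0" };
	struct net_device eth  = { &bond, "eth0" };

	/* single volatile load, as in the vlan_core.c callers below */
	struct net_device *master = ACCESS_ONCE(eth.master);
	printf("drop: %d\n", skb_bond_should_drop(&eth, master));
	return 0;
}
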
diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h
index 53923868c9bd..361d6b5630ee 100644
--- a/include/linux/netfilter/nfnetlink.h
+++ b/include/linux/netfilter/nfnetlink.h
@@ -76,7 +76,7 @@ extern int nfnetlink_subsys_unregister(const struct nfnetlink_subsystem *n);
76extern int nfnetlink_has_listeners(struct net *net, unsigned int group); 76extern int nfnetlink_has_listeners(struct net *net, unsigned int group);
77extern int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned group, 77extern int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned group,
78 int echo, gfp_t flags); 78 int echo, gfp_t flags);
79extern void nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error); 79extern int nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error);
80extern int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u_int32_t pid, int flags); 80extern int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u_int32_t pid, int flags);
81 81
82extern void nfnl_lock(void); 82extern void nfnl_lock(void);
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index fde27c017326..6eaca5e1e8ca 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -188,7 +188,7 @@ extern int netlink_has_listeners(struct sock *sk, unsigned int group);
188extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 pid, int nonblock); 188extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 pid, int nonblock);
189extern int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 pid, 189extern int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, __u32 pid,
190 __u32 group, gfp_t allocation); 190 __u32 group, gfp_t allocation);
191extern void netlink_set_err(struct sock *ssk, __u32 pid, __u32 group, int code); 191extern int netlink_set_err(struct sock *ssk, __u32 pid, __u32 group, int code);
192extern int netlink_register_notifier(struct notifier_block *nb); 192extern int netlink_register_notifier(struct notifier_block *nb);
193extern int netlink_unregister_notifier(struct notifier_block *nb); 193extern int netlink_unregister_notifier(struct notifier_block *nb);
194 194
diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h
index 99928dce37ea..7fa02b4af838 100644
--- a/include/linux/reiserfs_xattr.h
+++ b/include/linux/reiserfs_xattr.h
@@ -70,6 +70,11 @@ int reiserfs_security_write(struct reiserfs_transaction_handle *th,
70void reiserfs_security_free(struct reiserfs_security_handle *sec); 70void reiserfs_security_free(struct reiserfs_security_handle *sec);
71#endif 71#endif
72 72
73static inline int reiserfs_xattrs_initialized(struct super_block *sb)
74{
75 return REISERFS_SB(sb)->priv_root != NULL;
76}
77
73#define xattr_size(size) ((size) + sizeof(struct reiserfs_xattr_header)) 78#define xattr_size(size) ((size) + sizeof(struct reiserfs_xattr_header))
74static inline loff_t reiserfs_xattr_nblocks(struct inode *inode, loff_t size) 79static inline loff_t reiserfs_xattr_nblocks(struct inode *inode, loff_t size)
75{ 80{
diff --git a/include/linux/serial_sci.h b/include/linux/serial_sci.h
index 1b177d29a7f0..f5364a1de68b 100644
--- a/include/linux/serial_sci.h
+++ b/include/linux/serial_sci.h
@@ -2,7 +2,9 @@
2#define __LINUX_SERIAL_SCI_H 2#define __LINUX_SERIAL_SCI_H
3 3
4#include <linux/serial_core.h> 4#include <linux/serial_core.h>
5#ifdef CONFIG_SERIAL_SH_SCI_DMA
5#include <asm/dmaengine.h> 6#include <asm/dmaengine.h>
7#endif
6 8
7/* 9/*
8 * Generic header for SuperH SCI(F) (used by sh/sh64/h8300 and related parts) 10 * Generic header for SuperH SCI(F) (used by sh/sh64/h8300 and related parts)
@@ -30,8 +32,10 @@ struct plat_sci_port {
30 upf_t flags; /* UPF_* flags */ 32 upf_t flags; /* UPF_* flags */
31 char *clk; /* clock string */ 33 char *clk; /* clock string */
32 struct device *dma_dev; 34 struct device *dma_dev;
33 enum sh_dmae_slave_chan_id dma_slave_tx; 35#ifdef CONFIG_SERIAL_SH_SCI_DMA
34 enum sh_dmae_slave_chan_id dma_slave_rx; 36 unsigned int dma_slave_tx;
37 unsigned int dma_slave_rx;
38#endif
35}; 39};
36 40
37#endif /* __LINUX_SERIAL_SCI_H */ 41#endif /* __LINUX_SERIAL_SCI_H */
diff --git a/include/linux/sh_dma.h b/include/linux/sh_dma.h
new file mode 100644
index 000000000000..cdaaff424211
--- /dev/null
+++ b/include/linux/sh_dma.h
@@ -0,0 +1,101 @@
1/*
2 * Header for the new SH dmaengine driver
3 *
4 * Copyright (C) 2010 Guennadi Liakhovetski <g.liakhovetski@gmx.de>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 */
10#ifndef SH_DMA_H
11#define SH_DMA_H
12
13#include <linux/list.h>
14#include <linux/dmaengine.h>
15
16/* Used by slave DMA clients to request DMA to/from a specific peripheral */
17struct sh_dmae_slave {
18 unsigned int slave_id; /* Set by the platform */
19 struct device *dma_dev; /* Set by the platform */
20 struct sh_dmae_slave_config *config; /* Set by the driver */
21};
22
23struct sh_dmae_regs {
24 u32 sar; /* SAR / source address */
25 u32 dar; /* DAR / destination address */
26 u32 tcr; /* TCR / transfer count */
27};
28
29struct sh_desc {
30 struct sh_dmae_regs hw;
31 struct list_head node;
32 struct dma_async_tx_descriptor async_tx;
33 enum dma_data_direction direction;
34 dma_cookie_t cookie;
35 size_t partial;
36 int chunks;
37 int mark;
38};
39struct sh_dmae_slave_config {
40 unsigned int slave_id;
41 dma_addr_t addr;
42 u32 chcr;
43 char mid_rid;
44};
45
46struct sh_dmae_channel {
47 unsigned int offset;
48 unsigned int dmars;
49 unsigned int dmars_bit;
50};
51
52struct sh_dmae_pdata {
53 struct sh_dmae_slave_config *slave;
54 int slave_num;
55 struct sh_dmae_channel *channel;
56 int channel_num;
57 unsigned int ts_low_shift;
58 unsigned int ts_low_mask;
59 unsigned int ts_high_shift;
60 unsigned int ts_high_mask;
61 unsigned int *ts_shift;
62 int ts_shift_num;
63 u16 dmaor_init;
64};
65
66/* DMA register */
67#define SAR 0x00
68#define DAR 0x04
69#define TCR 0x08
70#define CHCR 0x0C
71#define DMAOR 0x40
72
73/* DMAOR definitions */
74#define DMAOR_AE 0x00000004
75#define DMAOR_NMIF 0x00000002
76#define DMAOR_DME 0x00000001
77
78/* Definitions for the SuperH DMAC */
79#define REQ_L 0x00000000
80#define REQ_E 0x00080000
81#define RACK_H 0x00000000
82#define RACK_L 0x00040000
83#define ACK_R 0x00000000
84#define ACK_W 0x00020000
85#define ACK_H 0x00000000
86#define ACK_L 0x00010000
87#define DM_INC 0x00004000
88#define DM_DEC 0x00008000
89#define DM_FIX 0x0000c000
90#define SM_INC 0x00001000
91#define SM_DEC 0x00002000
92#define SM_FIX 0x00003000
93#define RS_IN 0x00000200
94#define RS_OUT 0x00000300
95#define TS_BLK 0x00000040
96#define TM_BUR 0x00000020
97#define CHCR_DE 0x00000001
98#define CHCR_TE 0x00000002
99#define CHCR_IE 0x00000004
100
101#endif
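
To show how the structures in the new sh_dma.h relate, here is a hypothetical platform-data fragment for one slave and one channel. Only the struct layouts come from the header above; every ID, address and CHCR value below is an invented placeholder, and kernel types are stubbed so the sketch compiles in userspace:

#include <stdint.h>
#include <stdio.h>

/* stand-ins for kernel types */
typedef uint32_t u32;
typedef uint32_t dma_addr_t;

struct sh_dmae_slave_config {
	unsigned int slave_id;
	dma_addr_t addr;
	u32 chcr;
	char mid_rid;
};

struct sh_dmae_channel {
	unsigned int offset;
	unsigned int dmars;
	unsigned int dmars_bit;
};

/* hypothetical peripheral: one TX slave, one channel; all values invented */
static struct sh_dmae_slave_config board_slaves[] = {
	{ .slave_id = 0, .addr = 0xfe4c0020,
	  .chcr = 0x00001000 /* SM_INC */, .mid_rid = 0x21 },
};

static struct sh_dmae_channel board_channels[] = {
	{ .offset = 0x00, .dmars = 0x00, .dmars_bit = 0 },
};

int main(void)
{
	printf("slave %u -> FIFO at 0x%08x (mid/rid 0x%02x)\n",
	       board_slaves[0].slave_id,
	       (unsigned)board_slaves[0].addr,
	       (unsigned char)board_slaves[0].mid_rid);
	printf("channel regs at +0x%02x, DMARS +0x%02x bit %u\n",
	       board_channels[0].offset,
	       board_channels[0].dmars,
	       board_channels[0].dmars_bit);
	return 0;
}
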
diff --git a/include/linux/sunrpc/bc_xprt.h b/include/linux/sunrpc/bc_xprt.h
index d7152b451e21..7c91260c44a9 100644
--- a/include/linux/sunrpc/bc_xprt.h
+++ b/include/linux/sunrpc/bc_xprt.h
@@ -36,7 +36,6 @@ struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt);
36void xprt_free_bc_request(struct rpc_rqst *req); 36void xprt_free_bc_request(struct rpc_rqst *req);
37int xprt_setup_backchannel(struct rpc_xprt *, unsigned int min_reqs); 37int xprt_setup_backchannel(struct rpc_xprt *, unsigned int min_reqs);
38void xprt_destroy_backchannel(struct rpc_xprt *, int max_reqs); 38void xprt_destroy_backchannel(struct rpc_xprt *, int max_reqs);
39void bc_release_request(struct rpc_task *);
40int bc_send(struct rpc_rqst *req); 39int bc_send(struct rpc_rqst *req);
41 40
42/* 41/*
@@ -59,6 +58,10 @@ static inline int svc_is_backchannel(const struct svc_rqst *rqstp)
59{ 58{
60 return 0; 59 return 0;
61} 60}
61
62static inline void xprt_free_bc_request(struct rpc_rqst *req)
63{
64}
62#endif /* CONFIG_NFS_V4_1 */ 65#endif /* CONFIG_NFS_V4_1 */
63#endif /* _LINUX_SUNRPC_BC_XPRT_H */ 66#endif /* _LINUX_SUNRPC_BC_XPRT_H */
64 67
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index f994ae58a002..057929b0a651 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -688,7 +688,7 @@ asmlinkage long sys_shmat(int shmid, char __user *shmaddr, int shmflg);
688asmlinkage long sys_shmget(key_t key, size_t size, int flag); 688asmlinkage long sys_shmget(key_t key, size_t size, int flag);
689asmlinkage long sys_shmdt(char __user *shmaddr); 689asmlinkage long sys_shmdt(char __user *shmaddr);
690asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf); 690asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf);
691asmlinkage long sys_ipc(unsigned int call, int first, int second, 691asmlinkage long sys_ipc(unsigned int call, int first, unsigned long second,
692 unsigned long third, void __user *ptr, long fifth); 692 unsigned long third, void __user *ptr, long fifth);
693 693
694asmlinkage long sys_mq_open(const char __user *name, int oflag, mode_t mode, struct mq_attr __user *attr); 694asmlinkage long sys_mq_open(const char __user *name, int oflag, mode_t mode, struct mq_attr __user *attr);
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 568369a86306..4409967db0c4 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -70,12 +70,13 @@ struct tty_buffer {
70 70
71/* 71/*
72 * We default to dicing tty buffer allocations to this many characters 72 * We default to dicing tty buffer allocations to this many characters
73 * in order to avoid multiple page allocations. We assume tty_buffer itself 73 * in order to avoid multiple page allocations. We know the size of
74 * is under 256 bytes. See tty_buffer_find for the allocation logic this 74 * tty_buffer itself but it must also be taken into account that the
 75 * must match 75 * buffer is 256 byte aligned. See tty_buffer_find for the allocation
76 * logic this must match
76 */ 77 */
77 78
78#define TTY_BUFFER_PAGE ((PAGE_SIZE - 256) / 2) 79#define TTY_BUFFER_PAGE (((PAGE_SIZE - sizeof(struct tty_buffer)) / 2) & ~0xFF)
79 80
80 81
81struct tty_bufhead { 82struct tty_bufhead {
@@ -223,6 +224,7 @@ struct tty_port {
223 wait_queue_head_t close_wait; /* Close waiters */ 224 wait_queue_head_t close_wait; /* Close waiters */
224 wait_queue_head_t delta_msr_wait; /* Modem status change */ 225 wait_queue_head_t delta_msr_wait; /* Modem status change */
225 unsigned long flags; /* TTY flags ASY_*/ 226 unsigned long flags; /* TTY flags ASY_*/
227 unsigned char console:1; /* port is a console */
226 struct mutex mutex; /* Locking */ 228 struct mutex mutex; /* Locking */
227 struct mutex buf_mutex; /* Buffer alloc lock */ 229 struct mutex buf_mutex; /* Buffer alloc lock */
228 unsigned char *xmit_buf; /* Optional buffer */ 230 unsigned char *xmit_buf; /* Optional buffer */
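
Working through the new TTY_BUFFER_PAGE above for a 4 KiB page: subtract the real sizeof(struct tty_buffer) instead of assuming it is under 256 bytes, halve, then round down to a 256-byte multiple with & ~0xFF to match the aligned allocations in tty_buffer_find(). A small sketch — the structure size here is an assumed 40 bytes, since the real sizeof varies by build:

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* assumed size of struct tty_buffer; the real value is build-dependent */
#define TTY_BUFFER_SIZEOF 40UL

#define TTY_BUFFER_PAGE_OLD ((PAGE_SIZE - 256) / 2)
#define TTY_BUFFER_PAGE_NEW (((PAGE_SIZE - TTY_BUFFER_SIZEOF) / 2) & ~0xFFUL)

int main(void)
{
	/* old: (4096 - 256) / 2 = 1920
	 * new: (4096 - 40) / 2 = 2028, rounded down to 1792 (a multiple of
	 * 256, matching the 256-byte-aligned buffer allocations) */
	printf("old: %lu\nnew: %lu\n",
	       TTY_BUFFER_PAGE_OLD, TTY_BUFFER_PAGE_NEW);
	return 0;
}
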
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 8c9f053111bb..ce1323c4e47c 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1055,7 +1055,8 @@ typedef void (*usb_complete_t)(struct urb *);
1055 * @number_of_packets: Lists the number of ISO transfer buffers. 1055 * @number_of_packets: Lists the number of ISO transfer buffers.
1056 * @interval: Specifies the polling interval for interrupt or isochronous 1056 * @interval: Specifies the polling interval for interrupt or isochronous
1057 * transfers. The units are frames (milliseconds) for full and low 1057 * transfers. The units are frames (milliseconds) for full and low
1058 * speed devices, and microframes (1/8 millisecond) for highspeed ones. 1058 * speed devices, and microframes (1/8 millisecond) for highspeed
1059 * and SuperSpeed devices.
1059 * @error_count: Returns the number of ISO transfers that reported errors. 1060 * @error_count: Returns the number of ISO transfers that reported errors.
1060 * @context: For use in completion functions. This normally points to 1061 * @context: For use in completion functions. This normally points to
1061 * request-specific driver context. 1062 * request-specific driver context.
@@ -1286,9 +1287,16 @@ static inline void usb_fill_bulk_urb(struct urb *urb,
1286 * 1287 *
1287 * Initializes a interrupt urb with the proper information needed to submit 1288 * Initializes a interrupt urb with the proper information needed to submit
1288 * it to a device. 1289 * it to a device.
1289 * Note that high speed interrupt endpoints use a logarithmic encoding of 1290 *
1290 * the endpoint interval, and express polling intervals in microframes 1291 * Note that High Speed and SuperSpeed interrupt endpoints use a logarithmic
1291 * (eight per millisecond) rather than in frames (one per millisecond). 1292 * encoding of the endpoint interval, and express polling intervals in
1293 * microframes (eight per millisecond) rather than in frames (one per
1294 * millisecond).
1295 *
1296 * Wireless USB also uses the logarithmic encoding, but specifies it in units of
1297 * 128us instead of 125us. For Wireless USB devices, the interval is passed
1298 * through to the host controller, rather than being translated into microframe
1299 * units.
1292 */ 1300 */
1293static inline void usb_fill_int_urb(struct urb *urb, 1301static inline void usb_fill_int_urb(struct urb *urb,
1294 struct usb_device *dev, 1302 struct usb_device *dev,
@@ -1305,7 +1313,7 @@ static inline void usb_fill_int_urb(struct urb *urb,
1305 urb->transfer_buffer_length = buffer_length; 1313 urb->transfer_buffer_length = buffer_length;
1306 urb->complete = complete_fn; 1314 urb->complete = complete_fn;
1307 urb->context = context; 1315 urb->context = context;
1308 if (dev->speed == USB_SPEED_HIGH) 1316 if (dev->speed == USB_SPEED_HIGH || dev->speed == USB_SPEED_SUPER)
1309 urb->interval = 1 << (interval - 1); 1317 urb->interval = 1 << (interval - 1);
1310 else 1318 else
1311 urb->interval = interval; 1319 urb->interval = interval;
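
The usb_fill_int_urb() hunk extends the logarithmic interval decoding to SuperSpeed: for high-speed and SuperSpeed endpoints the descriptor's bInterval is an exponent, so the period is 2^(bInterval-1) microframes of 125 us each. A worked decoding mirroring the changed branch:

#include <stdio.h>

enum usb_device_speed { USB_SPEED_FULL, USB_SPEED_HIGH, USB_SPEED_SUPER };

static int decode_interval(enum usb_device_speed speed, int interval)
{
	/* same branch as usb_fill_int_urb() after the patch */
	if (speed == USB_SPEED_HIGH || speed == USB_SPEED_SUPER)
		return 1 << (interval - 1);  /* units: 125 us microframes */
	return interval;                     /* units: 1 ms frames */
}

int main(void)
{
	/* bInterval = 4 on a high-speed endpoint: 2^3 = 8 microframes = 1 ms */
	printf("HS bInterval 4 -> %d microframes\n",
	       decode_interval(USB_SPEED_HIGH, 4));
	/* the same field on a full-speed endpoint simply means 4 ms */
	printf("FS bInterval 4 -> %d frames\n",
	       decode_interval(USB_SPEED_FULL, 4));
	return 0;
}
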
diff --git a/include/linux/vt.h b/include/linux/vt.h
index 778b7b2a47d4..d5dd0bc408fd 100644
--- a/include/linux/vt.h
+++ b/include/linux/vt.h
@@ -27,7 +27,7 @@ struct vt_mode {
27#define VT_SETMODE 0x5602 /* set mode of active vt */ 27#define VT_SETMODE 0x5602 /* set mode of active vt */
28#define VT_AUTO 0x00 /* auto vt switching */ 28#define VT_AUTO 0x00 /* auto vt switching */
29#define VT_PROCESS 0x01 /* process controls switching */ 29#define VT_PROCESS 0x01 /* process controls switching */
30#define VT_PROCESS_AUTO 0x02 /* process is notified of switching */ 30#define VT_ACKACQ 0x02 /* acknowledge switch */
31 31
32struct vt_stat { 32struct vt_stat {
33 unsigned short v_active; /* active vt */ 33 unsigned short v_active; /* active vt */
@@ -38,7 +38,6 @@ struct vt_stat {
38#define VT_SENDSIG 0x5604 /* signal to send to bitmask of vts */ 38#define VT_SENDSIG 0x5604 /* signal to send to bitmask of vts */
39 39
40#define VT_RELDISP 0x5605 /* release display */ 40#define VT_RELDISP 0x5605 /* release display */
41#define VT_ACKACQ 0x02 /* acknowledge switch */
42 41
43#define VT_ACTIVATE 0x5606 /* make vt active */ 42#define VT_ACTIVATE 0x5606 /* make vt active */
44#define VT_WAITACTIVE 0x5607 /* wait for vt active */ 43#define VT_WAITACTIVE 0x5607 /* wait for vt active */
diff --git a/include/net/bluetooth/bluetooth.h b/include/net/bluetooth/bluetooth.h
index 04a6908e38d2..ff77e8f882f1 100644
--- a/include/net/bluetooth/bluetooth.h
+++ b/include/net/bluetooth/bluetooth.h
@@ -176,6 +176,6 @@ extern void hci_sock_cleanup(void);
176extern int bt_sysfs_init(void); 176extern int bt_sysfs_init(void);
177extern void bt_sysfs_cleanup(void); 177extern void bt_sysfs_cleanup(void);
178 178
179extern struct class *bt_class; 179extern struct dentry *bt_debugfs;
180 180
181#endif /* __BLUETOOTH_H */ 181#endif /* __BLUETOOTH_H */
diff --git a/include/net/netlink.h b/include/net/netlink.h
index f82e463c875a..4fc05b58503e 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -945,7 +945,11 @@ static inline u64 nla_get_u64(const struct nlattr *nla)
945 */ 945 */
946static inline __be64 nla_get_be64(const struct nlattr *nla) 946static inline __be64 nla_get_be64(const struct nlattr *nla)
947{ 947{
948 return *(__be64 *) nla_data(nla); 948 __be64 tmp;
949
950 nla_memcpy(&tmp, nla, sizeof(tmp));
951
952 return tmp;
949} 953}
950 954
951/** 955/**
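
The nla_get_be64() rewrite above avoids an unaligned load: netlink attribute payloads are only 4-byte aligned, so dereferencing one as a 64-bit value can trap on strict-alignment architectures, while a byte-wise copy is always safe. A userspace sketch with plain memcpy() standing in for nla_memcpy():

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* payload starts at offset 4, so a u64 inside it is only 4-byte aligned */
static unsigned char msg[12] = { 0, 0, 0, 0,
				 1, 2, 3, 4, 5, 6, 7, 8 };

int main(void)
{
	void *payload = msg + 4;
	uint64_t tmp;

	/* *(uint64_t *)payload here could fault on e.g. sparc; instead: */
	memcpy(&tmp, payload, sizeof(tmp));  /* byte copy, no alignment needed */

	printf("0x%016llx\n", (unsigned long long)tmp);
	return 0;
}
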
diff --git a/init/main.c b/init/main.c
index a1ab78ceb4b6..cbead27caefc 100644
--- a/init/main.c
+++ b/init/main.c
@@ -858,7 +858,7 @@ static int __init kernel_init(void * unused)
858 /* 858 /*
859 * init can allocate pages on any node 859 * init can allocate pages on any node
860 */ 860 */
861 set_mems_allowed(node_possible_map); 861 set_mems_allowed(node_states[N_HIGH_MEMORY]);
862 /* 862 /*
863 * init can run on any cpu. 863 * init can run on any cpu.
864 */ 864 */
diff --git a/ipc/syscall.c b/ipc/syscall.c
index 355a3da9ec73..1d6f53f6b562 100644
--- a/ipc/syscall.c
+++ b/ipc/syscall.c
@@ -13,7 +13,7 @@
13#include <linux/syscalls.h> 13#include <linux/syscalls.h>
14#include <linux/uaccess.h> 14#include <linux/uaccess.h>
15 15
16SYSCALL_DEFINE6(ipc, unsigned int, call, int, first, int, second, 16SYSCALL_DEFINE6(ipc, unsigned int, call, int, first, unsigned long, second,
17 unsigned long, third, void __user *, ptr, long, fifth) 17 unsigned long, third, void __user *, ptr, long, fifth)
18{ 18{
19 int version, ret; 19 int version, ret;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ef909a329750..e2769e13980c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -27,7 +27,6 @@
27 */ 27 */
28 28
29#include <linux/cgroup.h> 29#include <linux/cgroup.h>
30#include <linux/module.h>
31#include <linux/ctype.h> 30#include <linux/ctype.h>
32#include <linux/errno.h> 31#include <linux/errno.h>
33#include <linux/fs.h> 32#include <linux/fs.h>
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ba401fab459f..d10946748ec2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -920,9 +920,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
920 * call to guarantee_online_mems(), as we know no one is changing 920 * call to guarantee_online_mems(), as we know no one is changing
921 * our task's cpuset. 921 * our task's cpuset.
922 * 922 *
923 * Hold callback_mutex around the two modifications of our tasks
924 * mems_allowed to synchronize with cpuset_mems_allowed().
925 *
926 * While the mm_struct we are migrating is typically from some 923 * While the mm_struct we are migrating is typically from some
927 * other task, the task_struct mems_allowed that we are hacking 924 * other task, the task_struct mems_allowed that we are hacking
928 * is for our current task, which must allocate new pages for that 925 * is for our current task, which must allocate new pages for that
@@ -973,15 +970,20 @@ static void cpuset_change_nodemask(struct task_struct *p,
973 struct cpuset *cs; 970 struct cpuset *cs;
974 int migrate; 971 int migrate;
975 const nodemask_t *oldmem = scan->data; 972 const nodemask_t *oldmem = scan->data;
976 nodemask_t newmems; 973 NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
974
975 if (!newmems)
976 return;
977 977
978 cs = cgroup_cs(scan->cg); 978 cs = cgroup_cs(scan->cg);
979 guarantee_online_mems(cs, &newmems); 979 guarantee_online_mems(cs, newmems);
980 980
981 task_lock(p); 981 task_lock(p);
982 cpuset_change_task_nodemask(p, &newmems); 982 cpuset_change_task_nodemask(p, newmems);
983 task_unlock(p); 983 task_unlock(p);
984 984
985 NODEMASK_FREE(newmems);
986
985 mm = get_task_mm(p); 987 mm = get_task_mm(p);
986 if (!mm) 988 if (!mm)
987 return; 989 return;
@@ -1051,16 +1053,21 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1051static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, 1053static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1052 const char *buf) 1054 const char *buf)
1053{ 1055{
1054 nodemask_t oldmem; 1056 NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
1055 int retval; 1057 int retval;
1056 struct ptr_heap heap; 1058 struct ptr_heap heap;
1057 1059
1060 if (!oldmem)
1061 return -ENOMEM;
1062
1058 /* 1063 /*
1059 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; 1064 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
1060 * it's read-only 1065 * it's read-only
1061 */ 1066 */
1062 if (cs == &top_cpuset) 1067 if (cs == &top_cpuset) {
1063 return -EACCES; 1068 retval = -EACCES;
1069 goto done;
1070 }
1064 1071
1065 /* 1072 /*
1066 * An empty mems_allowed is ok iff there are no tasks in the cpuset. 1073 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
@@ -1076,11 +1083,13 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1076 goto done; 1083 goto done;
1077 1084
1078 if (!nodes_subset(trialcs->mems_allowed, 1085 if (!nodes_subset(trialcs->mems_allowed,
1079 node_states[N_HIGH_MEMORY])) 1086 node_states[N_HIGH_MEMORY])) {
1080 return -EINVAL; 1087 retval = -EINVAL;
1088 goto done;
1089 }
1081 } 1090 }
1082 oldmem = cs->mems_allowed; 1091 *oldmem = cs->mems_allowed;
1083 if (nodes_equal(oldmem, trialcs->mems_allowed)) { 1092 if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
1084 retval = 0; /* Too easy - nothing to do */ 1093 retval = 0; /* Too easy - nothing to do */
1085 goto done; 1094 goto done;
1086 } 1095 }
@@ -1096,10 +1105,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1096 cs->mems_allowed = trialcs->mems_allowed; 1105 cs->mems_allowed = trialcs->mems_allowed;
1097 mutex_unlock(&callback_mutex); 1106 mutex_unlock(&callback_mutex);
1098 1107
1099 update_tasks_nodemask(cs, &oldmem, &heap); 1108 update_tasks_nodemask(cs, oldmem, &heap);
1100 1109
1101 heap_free(&heap); 1110 heap_free(&heap);
1102done: 1111done:
1112 NODEMASK_FREE(oldmem);
1103 return retval; 1113 return retval;
1104} 1114}
1105 1115
@@ -1384,40 +1394,47 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1384 struct cgroup *oldcont, struct task_struct *tsk, 1394 struct cgroup *oldcont, struct task_struct *tsk,
1385 bool threadgroup) 1395 bool threadgroup)
1386{ 1396{
1387 nodemask_t from, to;
1388 struct mm_struct *mm; 1397 struct mm_struct *mm;
1389 struct cpuset *cs = cgroup_cs(cont); 1398 struct cpuset *cs = cgroup_cs(cont);
1390 struct cpuset *oldcs = cgroup_cs(oldcont); 1399 struct cpuset *oldcs = cgroup_cs(oldcont);
1400 NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
1401 NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
1402
1403 if (from == NULL || to == NULL)
1404 goto alloc_fail;
1391 1405
1392 if (cs == &top_cpuset) { 1406 if (cs == &top_cpuset) {
1393 cpumask_copy(cpus_attach, cpu_possible_mask); 1407 cpumask_copy(cpus_attach, cpu_possible_mask);
1394 to = node_possible_map;
1395 } else { 1408 } else {
1396 guarantee_online_cpus(cs, cpus_attach); 1409 guarantee_online_cpus(cs, cpus_attach);
1397 guarantee_online_mems(cs, &to);
1398 } 1410 }
1411 guarantee_online_mems(cs, to);
1399 1412
1400 /* do per-task migration stuff possibly for each in the threadgroup */ 1413 /* do per-task migration stuff possibly for each in the threadgroup */
1401 cpuset_attach_task(tsk, &to, cs); 1414 cpuset_attach_task(tsk, to, cs);
1402 if (threadgroup) { 1415 if (threadgroup) {
1403 struct task_struct *c; 1416 struct task_struct *c;
1404 rcu_read_lock(); 1417 rcu_read_lock();
1405 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 1418 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1406 cpuset_attach_task(c, &to, cs); 1419 cpuset_attach_task(c, to, cs);
1407 } 1420 }
1408 rcu_read_unlock(); 1421 rcu_read_unlock();
1409 } 1422 }
1410 1423
1411 /* change mm; only needs to be done once even if threadgroup */ 1424 /* change mm; only needs to be done once even if threadgroup */
1412 from = oldcs->mems_allowed; 1425 *from = oldcs->mems_allowed;
1413 to = cs->mems_allowed; 1426 *to = cs->mems_allowed;
1414 mm = get_task_mm(tsk); 1427 mm = get_task_mm(tsk);
1415 if (mm) { 1428 if (mm) {
1416 mpol_rebind_mm(mm, &to); 1429 mpol_rebind_mm(mm, to);
1417 if (is_memory_migrate(cs)) 1430 if (is_memory_migrate(cs))
1418 cpuset_migrate_mm(mm, &from, &to); 1431 cpuset_migrate_mm(mm, from, to);
1419 mmput(mm); 1432 mmput(mm);
1420 } 1433 }
1434
1435alloc_fail:
1436 NODEMASK_FREE(from);
1437 NODEMASK_FREE(to);
1421} 1438}
1422 1439
1423/* The various types of files and directories in a cpuset file system */ 1440/* The various types of files and directories in a cpuset file system */
@@ -1562,13 +1579,21 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1562 1579
1563static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) 1580static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1564{ 1581{
1565 nodemask_t mask; 1582 NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
1583 int retval;
1584
1585 if (mask == NULL)
1586 return -ENOMEM;
1566 1587
1567 mutex_lock(&callback_mutex); 1588 mutex_lock(&callback_mutex);
1568 mask = cs->mems_allowed; 1589 *mask = cs->mems_allowed;
1569 mutex_unlock(&callback_mutex); 1590 mutex_unlock(&callback_mutex);
1570 1591
1571 return nodelist_scnprintf(page, PAGE_SIZE, mask); 1592 retval = nodelist_scnprintf(page, PAGE_SIZE, *mask);
1593
1594 NODEMASK_FREE(mask);
1595
1596 return retval;
1572} 1597}
1573 1598
1574static ssize_t cpuset_common_file_read(struct cgroup *cont, 1599static ssize_t cpuset_common_file_read(struct cgroup *cont,
@@ -1997,7 +2022,10 @@ static void scan_for_empty_cpusets(struct cpuset *root)
1997 struct cpuset *cp; /* scans cpusets being updated */ 2022 struct cpuset *cp; /* scans cpusets being updated */
1998 struct cpuset *child; /* scans child cpusets of cp */ 2023 struct cpuset *child; /* scans child cpusets of cp */
1999 struct cgroup *cont; 2024 struct cgroup *cont;
2000 nodemask_t oldmems; 2025 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
2026
2027 if (oldmems == NULL)
2028 return;
2001 2029
2002 list_add_tail((struct list_head *)&root->stack_list, &queue); 2030 list_add_tail((struct list_head *)&root->stack_list, &queue);
2003 2031
@@ -2014,7 +2042,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2014 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2042 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2015 continue; 2043 continue;
2016 2044
2017 oldmems = cp->mems_allowed; 2045 *oldmems = cp->mems_allowed;
2018 2046
2019 /* Remove offline cpus and mems from this cpuset. */ 2047 /* Remove offline cpus and mems from this cpuset. */
2020 mutex_lock(&callback_mutex); 2048 mutex_lock(&callback_mutex);
@@ -2030,9 +2058,10 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2030 remove_tasks_in_empty_cpuset(cp); 2058 remove_tasks_in_empty_cpuset(cp);
2031 else { 2059 else {
2032 update_tasks_cpumask(cp, NULL); 2060 update_tasks_cpumask(cp, NULL);
2033 update_tasks_nodemask(cp, &oldmems, NULL); 2061 update_tasks_nodemask(cp, oldmems, NULL);
2034 } 2062 }
2035 } 2063 }
2064 NODEMASK_FREE(oldmems);
2036} 2065}
2037 2066
2038/* 2067/*
@@ -2090,20 +2119,33 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2090static int cpuset_track_online_nodes(struct notifier_block *self, 2119static int cpuset_track_online_nodes(struct notifier_block *self,
2091 unsigned long action, void *arg) 2120 unsigned long action, void *arg)
2092{ 2121{
2122 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
2123
2124 if (oldmems == NULL)
2125 return NOTIFY_DONE;
2126
2093 cgroup_lock(); 2127 cgroup_lock();
2094 switch (action) { 2128 switch (action) {
2095 case MEM_ONLINE: 2129 case MEM_ONLINE:
2096 case MEM_OFFLINE: 2130 *oldmems = top_cpuset.mems_allowed;
2097 mutex_lock(&callback_mutex); 2131 mutex_lock(&callback_mutex);
2098 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2132 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2099 mutex_unlock(&callback_mutex); 2133 mutex_unlock(&callback_mutex);
2100 if (action == MEM_OFFLINE) 2134 update_tasks_nodemask(&top_cpuset, oldmems, NULL);
2101 scan_for_empty_cpusets(&top_cpuset); 2135 break;
2136 case MEM_OFFLINE:
2137 /*
2138 * needn't update top_cpuset.mems_allowed explicitly because
2139 * scan_for_empty_cpusets() will update it.
2140 */
2141 scan_for_empty_cpusets(&top_cpuset);
2102 break; 2142 break;
2103 default: 2143 default:
2104 break; 2144 break;
2105 } 2145 }
2106 cgroup_unlock(); 2146 cgroup_unlock();
2147
2148 NODEMASK_FREE(oldmems);
2107 return NOTIFY_OK; 2149 return NOTIFY_OK;
2108} 2150}
2109#endif 2151#endif
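
Every cpuset.c hunk above follows one pattern: with a large CONFIG_NODES_SHIFT a nodemask_t is a sizable bitmap (128 bytes at 1024 nodes on a 64-bit build), so NODEMASK_ALLOC() moves it off the kernel stack and each early return becomes a goto to a NODEMASK_FREE() cleanup label. A compact userspace rendering of that shape, with malloc/free standing in for the NODEMASK_* macros and invented errno values:

#include <stdlib.h>
#include <stdio.h>
#include <string.h>

/* at 1024 nodes this bitmap is 128 bytes on a 64-bit build -- several of
 * them in one stack frame is exactly what the patch avoids */
typedef struct {
	unsigned long bits[1024 / (8 * sizeof(unsigned long))];
} nodemask_t;

static int update_nodemask(const nodemask_t *trial)
{
	int retval;
	nodemask_t *oldmem = malloc(sizeof(*oldmem));  /* NODEMASK_ALLOC */

	if (!oldmem)
		return -12;  /* -ENOMEM */

	if (!trial) {        /* every failure now funnels through done: */
		retval = -13;  /* stand-in error code */
		goto done;
	}

	memcpy(oldmem, trial, sizeof(*oldmem));
	retval = 0;
done:
	free(oldmem);        /* NODEMASK_FREE */
	return retval;
}

int main(void)
{
	nodemask_t t = { { 0 } };
	printf("%d %d\n", update_nodemask(&t), update_nodemask(NULL));
	return 0;
}
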
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 82ed0ea15194..83911c780175 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -219,7 +219,7 @@ int kthreadd(void *unused)
219 set_task_comm(tsk, "kthreadd"); 219 set_task_comm(tsk, "kthreadd");
220 ignore_signals(tsk); 220 ignore_signals(tsk);
221 set_cpus_allowed_ptr(tsk, cpu_all_mask); 221 set_cpus_allowed_ptr(tsk, cpu_all_mask);
222 set_mems_allowed(node_possible_map); 222 set_mems_allowed(node_states[N_HIGH_MEMORY]);
223 223
224 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; 224 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
225 225
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 8e5ec5e1ab91..1fafb4b99c9b 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -103,7 +103,8 @@ config HEADERS_CHECK
103 103
104config DEBUG_SECTION_MISMATCH 104config DEBUG_SECTION_MISMATCH
105 bool "Enable full Section mismatch analysis" 105 bool "Enable full Section mismatch analysis"
106 depends on UNDEFINED 106 depends on UNDEFINED || (BLACKFIN)
107 default y
107 # This option is on purpose disabled for now. 108 # This option is on purpose disabled for now.
108 # It will be enabled when we are down to a reasonable number 109 # It will be enabled when we are down to a reasonable number
109 # of section mismatch warnings (< 10 for an allyesconfig build) 110 # of section mismatch warnings (< 10 for an allyesconfig build)
diff --git a/mm/ksm.c b/mm/ksm.c
index a93f1b7f508c..8cdfc2a1e8bf 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -751,7 +751,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
751 * page 751 * page
752 */ 752 */
753 if (page_mapcount(page) + 1 + swapped != page_count(page)) { 753 if (page_mapcount(page) + 1 + swapped != page_count(page)) {
754 set_pte_at_notify(mm, addr, ptep, entry); 754 set_pte_at(mm, addr, ptep, entry);
755 goto out_unlock; 755 goto out_unlock;
756 } 756 }
757 entry = pte_wrprotect(entry); 757 entry = pte_wrprotect(entry);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7973b5221fb8..9ed760dc7448 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3691,8 +3691,10 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
3691 else 3691 else
3692 mem = vmalloc(size); 3692 mem = vmalloc(size);
3693 3693
3694 if (mem) 3694 if (!mem)
3695 memset(mem, 0, size); 3695 return NULL;
3696
3697 memset(mem, 0, size);
3696 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 3698 mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
3697 if (!mem->stat) { 3699 if (!mem->stat) {
3698 if (size < PAGE_SIZE) 3700 if (size < PAGE_SIZE)
@@ -3946,28 +3948,6 @@ one_by_one:
3946 } 3948 }
3947 return ret; 3949 return ret;
3948} 3950}
3949#else /* !CONFIG_MMU */
3950static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
3951 struct cgroup *cgroup,
3952 struct task_struct *p,
3953 bool threadgroup)
3954{
3955 return 0;
3956}
3957static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
3958 struct cgroup *cgroup,
3959 struct task_struct *p,
3960 bool threadgroup)
3961{
3962}
3963static void mem_cgroup_move_task(struct cgroup_subsys *ss,
3964 struct cgroup *cont,
3965 struct cgroup *old_cont,
3966 struct task_struct *p,
3967 bool threadgroup)
3968{
3969}
3970#endif
3971 3951
3972/** 3952/**
3973 * is_target_pte_for_mc - check a pte whether it is valid for move charge 3953 * is_target_pte_for_mc - check a pte whether it is valid for move charge
@@ -4330,6 +4310,28 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4330 } 4310 }
4331 mem_cgroup_clear_mc(); 4311 mem_cgroup_clear_mc();
4332} 4312}
4313#else /* !CONFIG_MMU */
4314static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4315 struct cgroup *cgroup,
4316 struct task_struct *p,
4317 bool threadgroup)
4318{
4319 return 0;
4320}
4321static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
4322 struct cgroup *cgroup,
4323 struct task_struct *p,
4324 bool threadgroup)
4325{
4326}
4327static void mem_cgroup_move_task(struct cgroup_subsys *ss,
4328 struct cgroup *cont,
4329 struct cgroup *old_cont,
4330 struct task_struct *p,
4331 bool threadgroup)
4332{
4333}
4334#endif
4333 4335
4334struct cgroup_subsys mem_cgroup_subsys = { 4336struct cgroup_subsys mem_cgroup_subsys = {
4335 .name = "memory", 4337 .name = "memory",
diff --git a/mm/memory.c b/mm/memory.c
index 5b7f2002e54b..bc9ba5a1f5b9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -130,6 +130,7 @@ void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm)
130 130
131 for (i = 0; i < NR_MM_COUNTERS; i++) { 131 for (i = 0; i < NR_MM_COUNTERS; i++) {
132 if (task->rss_stat.count[i]) { 132 if (task->rss_stat.count[i]) {
133 BUG_ON(!mm);
133 add_mm_counter(mm, i, task->rss_stat.count[i]); 134 add_mm_counter(mm, i, task->rss_stat.count[i]);
134 task->rss_stat.count[i] = 0; 135 task->rss_stat.count[i] = 0;
135 } 136 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 643f66e10187..8034abd3a135 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -806,9 +806,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
806 806
807 err = 0; 807 err = 0;
808 if (nmask) { 808 if (nmask) {
809 task_lock(current); 809 if (mpol_store_user_nodemask(pol)) {
810 get_policy_nodemask(pol, nmask); 810 *nmask = pol->w.user_nodemask;
811 task_unlock(current); 811 } else {
812 task_lock(current);
813 get_policy_nodemask(pol, nmask);
814 task_unlock(current);
815 }
812 } 816 }
813 817
814 out: 818 out:
@@ -2195,8 +2199,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2195 char *rest = nodelist; 2199 char *rest = nodelist;
2196 while (isdigit(*rest)) 2200 while (isdigit(*rest))
2197 rest++; 2201 rest++;
2198 if (!*rest) 2202 if (*rest)
2199 err = 0; 2203 goto out;
2200 } 2204 }
2201 break; 2205 break;
2202 case MPOL_INTERLEAVE: 2206 case MPOL_INTERLEAVE:
@@ -2205,7 +2209,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2205 */ 2209 */
2206 if (!nodelist) 2210 if (!nodelist)
2207 nodes = node_states[N_HIGH_MEMORY]; 2211 nodes = node_states[N_HIGH_MEMORY];
2208 err = 0;
2209 break; 2212 break;
2210 case MPOL_LOCAL: 2213 case MPOL_LOCAL:
2211 /* 2214 /*
@@ -2215,11 +2218,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2215 goto out; 2218 goto out;
2216 mode = MPOL_PREFERRED; 2219 mode = MPOL_PREFERRED;
2217 break; 2220 break;
2218 2221 case MPOL_DEFAULT:
2219 /* 2222 /*
2220 * case MPOL_BIND: mpol_new() enforces non-empty nodemask. 2223 * Insist on an empty nodelist
2221 * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags. 2224 */
2222 */ 2225 if (!nodelist)
2226 err = 0;
2227 goto out;
2228 case MPOL_BIND:
2229 /*
2230 * Insist on a nodelist
2231 */
2232 if (!nodelist)
2233 goto out;
2223 } 2234 }
2224 2235
2225 mode_flags = 0; 2236 mode_flags = 0;
@@ -2233,13 +2244,14 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2233 else if (!strcmp(flags, "relative")) 2244 else if (!strcmp(flags, "relative"))
2234 mode_flags |= MPOL_F_RELATIVE_NODES; 2245 mode_flags |= MPOL_F_RELATIVE_NODES;
2235 else 2246 else
2236 err = 1; 2247 goto out;
2237 } 2248 }
2238 2249
2239 new = mpol_new(mode, mode_flags, &nodes); 2250 new = mpol_new(mode, mode_flags, &nodes);
2240 if (IS_ERR(new)) 2251 if (IS_ERR(new))
2241 err = 1; 2252 goto out;
2242 else { 2253
2254 {
2243 int ret; 2255 int ret;
2244 NODEMASK_SCRATCH(scratch); 2256 NODEMASK_SCRATCH(scratch);
2245 if (scratch) { 2257 if (scratch) {
@@ -2250,13 +2262,15 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
2250 ret = -ENOMEM; 2262 ret = -ENOMEM;
2251 NODEMASK_SCRATCH_FREE(scratch); 2263 NODEMASK_SCRATCH_FREE(scratch);
2252 if (ret) { 2264 if (ret) {
2253 err = 1;
2254 mpol_put(new); 2265 mpol_put(new);
2255 } else if (no_context) { 2266 goto out;
2256 /* save for contextualization */
2257 new->w.user_nodemask = nodes;
2258 } 2267 }
2259 } 2268 }
2269 err = 0;
2270 if (no_context) {
2271 /* save for contextualization */
2272 new->w.user_nodemask = nodes;
2273 }
2260 2274
2261out: 2275out:
2262 /* Restore string for error message */ 2276 /* Restore string for error message */
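
The mpol_parse_str() rework above inverts the error handling: err starts at 1, every invalid mode/nodelist combination jumps to out, and err is cleared exactly once on the success path — the old per-case err = 0 assignments could leave err clear even when later flag parsing failed. A toy parser in the same style; the modes and their rules are invented:

#include <stdio.h>
#include <string.h>

/* toy rules: "bind" requires an argument, "default" forbids one */
static int parse_policy(const char *mode, const char *arg)
{
	int err = 1;               /* assume failure */

	if (!strcmp(mode, "default")) {
		if (arg)
			goto out;  /* insist on an empty argument */
	} else if (!strcmp(mode, "bind")) {
		if (!arg)
			goto out;  /* insist on an argument */
	} else {
		goto out;          /* unknown mode */
	}

	err = 0;                   /* the one success assignment */
out:
	return err;
}

int main(void)
{
	printf("%d %d %d\n",
	       parse_policy("bind", "0-3"),    /* 0: valid */
	       parse_policy("bind", NULL),     /* 1: missing nodelist */
	       parse_policy("default", "0"));  /* 1: unwanted nodelist */
	return 0;
}
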
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index 0777654147c9..9e82e937000e 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -53,6 +53,7 @@ void unuse_mm(struct mm_struct *mm)
53 struct task_struct *tsk = current; 53 struct task_struct *tsk = current;
54 54
55 task_lock(tsk); 55 task_lock(tsk);
56 sync_mm_rss(tsk, mm);
56 tsk->mm = NULL; 57 tsk->mm = NULL;
57 /* active_mm is still 'mm' */ 58 /* active_mm is still 'mm' */
58 enter_lazy_tlb(mm, tsk); 59 enter_lazy_tlb(mm, tsk);
diff --git a/mm/nommu.c b/mm/nommu.c
index 605ace8982a8..e4b8f4d28a3f 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1040,10 +1040,9 @@ static int do_mmap_shared_file(struct vm_area_struct *vma)
1040 if (ret != -ENOSYS) 1040 if (ret != -ENOSYS)
1041 return ret; 1041 return ret;
1042 1042
1043 /* getting an ENOSYS error indicates that direct mmap isn't 1043 /* getting -ENOSYS indicates that direct mmap isn't possible (as
1044 * possible (as opposed to tried but failed) so we'll fall 1044 * opposed to tried but failed) so we can only give a suitable error as
1045 * through to making a private copy of the data and mapping 1045 * it's not possible to make a private copy if MAP_SHARED was given */
1046 * that if we can */
1047 return -ENODEV; 1046 return -ENODEV;
1048} 1047}
1049 1048
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index c0316e0ca6e8..c584a0af77d3 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -11,7 +11,7 @@ int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp,
11 if (netpoll_rx(skb)) 11 if (netpoll_rx(skb))
12 return NET_RX_DROP; 12 return NET_RX_DROP;
13 13
14 if (skb_bond_should_drop(skb)) 14 if (skb_bond_should_drop(skb, ACCESS_ONCE(skb->dev->master)))
15 goto drop; 15 goto drop;
16 16
17 skb->skb_iif = skb->dev->ifindex; 17 skb->skb_iif = skb->dev->ifindex;
@@ -83,7 +83,7 @@ vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp,
83{ 83{
84 struct sk_buff *p; 84 struct sk_buff *p;
85 85
86 if (skb_bond_should_drop(skb)) 86 if (skb_bond_should_drop(skb, ACCESS_ONCE(skb->dev->master)))
87 goto drop; 87 goto drop;
88 88
89 skb->skb_iif = skb->dev->ifindex; 89 skb->skb_iif = skb->dev->ifindex;
diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c
index cafb55b0cea5..05fd125f74fe 100644
--- a/net/bluetooth/hci_sysfs.c
+++ b/net/bluetooth/hci_sysfs.c
@@ -8,8 +8,7 @@
8#include <net/bluetooth/bluetooth.h> 8#include <net/bluetooth/bluetooth.h>
9#include <net/bluetooth/hci_core.h> 9#include <net/bluetooth/hci_core.h>
10 10
11struct class *bt_class = NULL; 11static struct class *bt_class;
12EXPORT_SYMBOL_GPL(bt_class);
13 12
14struct dentry *bt_debugfs = NULL; 13struct dentry *bt_debugfs = NULL;
15EXPORT_SYMBOL_GPL(bt_debugfs); 14EXPORT_SYMBOL_GPL(bt_debugfs);
diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c
index 4db7ae2fe07d..7794a2e2adce 100644
--- a/net/bluetooth/l2cap.c
+++ b/net/bluetooth/l2cap.c
@@ -40,6 +40,8 @@
 #include <linux/skbuff.h>
 #include <linux/list.h>
 #include <linux/device.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
 #include <linux/uaccess.h>
 #include <linux/crc16.h>
 #include <net/sock.h>
@@ -2830,6 +2832,11 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr
	int len = cmd->len - sizeof(*rsp);
	char req[64];

+	if (len > sizeof(req) - sizeof(struct l2cap_conf_req)) {
+		l2cap_send_disconn_req(conn, sk);
+		goto done;
+	}
+
	/* throw out any old stored conf requests */
	result = L2CAP_CONF_SUCCESS;
	len = l2cap_parse_conf_rsp(sk, rsp->data,
@@ -3937,31 +3944,42 @@ drop:
	return 0;
 }

-static ssize_t l2cap_sysfs_show(struct class *dev,
-				struct class_attribute *attr,
-				char *buf)
+static int l2cap_debugfs_show(struct seq_file *f, void *p)
 {
	struct sock *sk;
	struct hlist_node *node;
-	char *str = buf;

	read_lock_bh(&l2cap_sk_list.lock);

	sk_for_each(sk, node, &l2cap_sk_list.head) {
		struct l2cap_pinfo *pi = l2cap_pi(sk);

-		str += sprintf(str, "%s %s %d %d 0x%4.4x 0x%4.4x %d %d %d\n",
-				batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst),
-				sk->sk_state, __le16_to_cpu(pi->psm), pi->scid,
-				pi->dcid, pi->imtu, pi->omtu, pi->sec_level);
+		seq_printf(f, "%s %s %d %d 0x%4.4x 0x%4.4x %d %d %d\n",
+					batostr(&bt_sk(sk)->src),
+					batostr(&bt_sk(sk)->dst),
+					sk->sk_state, __le16_to_cpu(pi->psm),
+					pi->scid, pi->dcid,
+					pi->imtu, pi->omtu, pi->sec_level);
	}

	read_unlock_bh(&l2cap_sk_list.lock);

-	return str - buf;
+	return 0;
 }

-static CLASS_ATTR(l2cap, S_IRUGO, l2cap_sysfs_show, NULL);
+static int l2cap_debugfs_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, l2cap_debugfs_show, inode->i_private);
+}
+
+static const struct file_operations l2cap_debugfs_fops = {
+	.open		= l2cap_debugfs_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static struct dentry *l2cap_debugfs;

 static const struct proto_ops l2cap_sock_ops = {
	.family		= PF_BLUETOOTH,
@@ -4021,8 +4039,12 @@ static int __init l2cap_init(void)
		goto error;
	}

-	if (class_create_file(bt_class, &class_attr_l2cap) < 0)
-		BT_ERR("Failed to create L2CAP info file");
+	if (bt_debugfs) {
+		l2cap_debugfs = debugfs_create_file("l2cap", 0444,
+					bt_debugfs, NULL, &l2cap_debugfs_fops);
+		if (!l2cap_debugfs)
+			BT_ERR("Failed to create L2CAP debug file");
+	}

	BT_INFO("L2CAP ver %s", VERSION);
	BT_INFO("L2CAP socket layer initialized");
@@ -4036,7 +4058,7 @@ error:

 static void __exit l2cap_exit(void)
 {
-	class_remove_file(bt_class, &class_attr_l2cap);
+	debugfs_remove(l2cap_debugfs);

	if (bt_sock_unregister(BTPROTO_L2CAP) < 0)
		BT_ERR("L2CAP socket unregistration failed");
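
The l2cap hunks above, and the rfcomm and sco conversions that follow, are all the same change: a sysfs class attribute that had to sprintf() its whole output into one page is replaced by a debugfs file backed by seq_file, which can emit any number of records. A minimal self-contained sketch of the pattern; the "example" names are hypothetical and not part of the patch:

/* Sketch only: every "example" identifier is hypothetical. */
#include <linux/debugfs.h>
#include <linux/fs.h>
#include <linux/module.h>
#include <linux/seq_file.h>

static struct dentry *example_debugfs;

static int example_debugfs_show(struct seq_file *f, void *p)
{
	int i;

	/* seq_printf() grows the buffer as needed; no PAGE_SIZE cap. */
	for (i = 0; i < 3; i++)
		seq_printf(f, "record %d\n", i);
	return 0;
}

static int example_debugfs_open(struct inode *inode, struct file *file)
{
	return single_open(file, example_debugfs_show, inode->i_private);
}

static const struct file_operations example_debugfs_fops = {
	.owner		= THIS_MODULE,
	.open		= example_debugfs_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static int __init example_init(void)
{
	/* NULL parent: create at the debugfs root for this sketch. */
	example_debugfs = debugfs_create_file("example", 0444, NULL, NULL,
					      &example_debugfs_fops);
	if (!example_debugfs)
		pr_err("Failed to create example debug file\n");
	return 0;
}

static void __exit example_exit(void)
{
	debugfs_remove(example_debugfs);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");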
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index db8a68e1a5ba..13f114e8b0f9 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -33,6 +33,8 @@
 #include <linux/init.h>
 #include <linux/wait.h>
 #include <linux/device.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
 #include <linux/net.h>
 #include <linux/mutex.h>
 #include <linux/kthread.h>
@@ -2098,13 +2100,10 @@ static struct hci_cb rfcomm_cb = {
	.security_cfm = rfcomm_security_cfm
 };

-static ssize_t rfcomm_dlc_sysfs_show(struct class *dev,
-				     struct class_attribute *attr,
-				     char *buf)
+static int rfcomm_dlc_debugfs_show(struct seq_file *f, void *x)
 {
	struct rfcomm_session *s;
	struct list_head *pp, *p;
-	char *str = buf;

	rfcomm_lock();

@@ -2114,18 +2113,32 @@ static ssize_t rfcomm_dlc_sysfs_show(struct class *dev,
			struct sock *sk = s->sock->sk;
			struct rfcomm_dlc *d = list_entry(pp, struct rfcomm_dlc, list);

-			str += sprintf(str, "%s %s %ld %d %d %d %d\n",
-					batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst),
-					d->state, d->dlci, d->mtu, d->rx_credits, d->tx_credits);
+			seq_printf(f, "%s %s %ld %d %d %d %d\n",
+						batostr(&bt_sk(sk)->src),
+						batostr(&bt_sk(sk)->dst),
+						d->state, d->dlci, d->mtu,
+						d->rx_credits, d->tx_credits);
		}
	}

	rfcomm_unlock();

-	return (str - buf);
+	return 0;
 }

-static CLASS_ATTR(rfcomm_dlc, S_IRUGO, rfcomm_dlc_sysfs_show, NULL);
+static int rfcomm_dlc_debugfs_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, rfcomm_dlc_debugfs_show, inode->i_private);
+}
+
+static const struct file_operations rfcomm_dlc_debugfs_fops = {
+	.open		= rfcomm_dlc_debugfs_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static struct dentry *rfcomm_dlc_debugfs;

 /* ---- Initialization ---- */
 static int __init rfcomm_init(void)
@@ -2142,8 +2155,12 @@ static int __init rfcomm_init(void)
		goto unregister;
	}

-	if (class_create_file(bt_class, &class_attr_rfcomm_dlc) < 0)
-		BT_ERR("Failed to create RFCOMM info file");
+	if (bt_debugfs) {
+		rfcomm_dlc_debugfs = debugfs_create_file("rfcomm_dlc", 0444,
+				bt_debugfs, NULL, &rfcomm_dlc_debugfs_fops);
+		if (!rfcomm_dlc_debugfs)
+			BT_ERR("Failed to create RFCOMM debug file");
+	}

	err = rfcomm_init_ttys();
	if (err < 0)
@@ -2171,7 +2188,7 @@ unregister:

 static void __exit rfcomm_exit(void)
 {
-	class_remove_file(bt_class, &class_attr_rfcomm_dlc);
+	debugfs_remove(rfcomm_dlc_debugfs);

	hci_unregister_cb(&rfcomm_cb);

diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c
index ca87d6ac6a20..7f439765403d 100644
--- a/net/bluetooth/rfcomm/sock.c
+++ b/net/bluetooth/rfcomm/sock.c
@@ -40,6 +40,8 @@
 #include <linux/skbuff.h>
 #include <linux/list.h>
 #include <linux/device.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
 #include <net/sock.h>

 #include <asm/system.h>
@@ -1061,28 +1063,38 @@ done:
	return result;
 }

-static ssize_t rfcomm_sock_sysfs_show(struct class *dev,
-				      struct class_attribute *attr,
-				      char *buf)
+static int rfcomm_sock_debugfs_show(struct seq_file *f, void *p)
 {
	struct sock *sk;
	struct hlist_node *node;
-	char *str = buf;

	read_lock_bh(&rfcomm_sk_list.lock);

	sk_for_each(sk, node, &rfcomm_sk_list.head) {
-		str += sprintf(str, "%s %s %d %d\n",
-				batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst),
+		seq_printf(f, "%s %s %d %d\n",
+				batostr(&bt_sk(sk)->src),
+				batostr(&bt_sk(sk)->dst),
				sk->sk_state, rfcomm_pi(sk)->channel);
	}

	read_unlock_bh(&rfcomm_sk_list.lock);

-	return (str - buf);
+	return 0;
 }

-static CLASS_ATTR(rfcomm, S_IRUGO, rfcomm_sock_sysfs_show, NULL);
+static int rfcomm_sock_debugfs_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, rfcomm_sock_debugfs_show, inode->i_private);
+}
+
+static const struct file_operations rfcomm_sock_debugfs_fops = {
+	.open		= rfcomm_sock_debugfs_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static struct dentry *rfcomm_sock_debugfs;

 static const struct proto_ops rfcomm_sock_ops = {
	.family		= PF_BLUETOOTH,
@@ -1122,8 +1134,12 @@ int __init rfcomm_init_sockets(void)
	if (err < 0)
		goto error;

-	if (class_create_file(bt_class, &class_attr_rfcomm) < 0)
-		BT_ERR("Failed to create RFCOMM info file");
+	if (bt_debugfs) {
+		rfcomm_sock_debugfs = debugfs_create_file("rfcomm", 0444,
+				bt_debugfs, NULL, &rfcomm_sock_debugfs_fops);
+		if (!rfcomm_sock_debugfs)
+			BT_ERR("Failed to create RFCOMM debug file");
+	}

	BT_INFO("RFCOMM socket layer initialized");

@@ -1137,7 +1153,7 @@ error:

 void rfcomm_cleanup_sockets(void)
 {
-	class_remove_file(bt_class, &class_attr_rfcomm);
+	debugfs_remove(rfcomm_sock_debugfs);

	if (bt_sock_unregister(BTPROTO_RFCOMM) < 0)
		BT_ERR("RFCOMM socket layer unregistration failed");
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index f93b939539bc..e5b16b76b22e 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -38,6 +38,8 @@
 #include <linux/socket.h>
 #include <linux/skbuff.h>
 #include <linux/device.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
 #include <linux/list.h>
 #include <net/sock.h>

@@ -953,28 +955,36 @@ drop:
	return 0;
 }

-static ssize_t sco_sysfs_show(struct class *dev,
-				struct class_attribute *attr,
-				char *buf)
+static int sco_debugfs_show(struct seq_file *f, void *p)
 {
	struct sock *sk;
	struct hlist_node *node;
-	char *str = buf;

	read_lock_bh(&sco_sk_list.lock);

	sk_for_each(sk, node, &sco_sk_list.head) {
-		str += sprintf(str, "%s %s %d\n",
-				batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst),
-				sk->sk_state);
+		seq_printf(f, "%s %s %d\n", batostr(&bt_sk(sk)->src),
+				batostr(&bt_sk(sk)->dst), sk->sk_state);
	}

	read_unlock_bh(&sco_sk_list.lock);

-	return (str - buf);
+	return 0;
 }

-static CLASS_ATTR(sco, S_IRUGO, sco_sysfs_show, NULL);
+static int sco_debugfs_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, sco_debugfs_show, inode->i_private);
+}
+
+static const struct file_operations sco_debugfs_fops = {
+	.open		= sco_debugfs_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static struct dentry *sco_debugfs;

 static const struct proto_ops sco_sock_ops = {
	.family		= PF_BLUETOOTH,
@@ -1032,8 +1042,12 @@ static int __init sco_init(void)
		goto error;
	}

-	if (class_create_file(bt_class, &class_attr_sco) < 0)
-		BT_ERR("Failed to create SCO info file");
+	if (bt_debugfs) {
+		sco_debugfs = debugfs_create_file("sco", 0444,
+					bt_debugfs, NULL, &sco_debugfs_fops);
+		if (!sco_debugfs)
+			BT_ERR("Failed to create SCO debug file");
+	}

	BT_INFO("SCO (Voice Link) ver %s", VERSION);
	BT_INFO("SCO socket layer initialized");
@@ -1047,7 +1061,7 @@ error:

 static void __exit sco_exit(void)
 {
-	class_remove_file(bt_class, &class_attr_sco);
+	debugfs_remove(sco_debugfs);

	if (bt_sock_unregister(BTPROTO_SCO) < 0)
		BT_ERR("SCO socket unregistration failed");
diff --git a/net/core/dev.c b/net/core/dev.c
index bcc490cc9452..59d4394d2ce8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2483,6 +2483,7 @@ int netif_receive_skb(struct sk_buff *skb)
 {
	struct packet_type *ptype, *pt_prev;
	struct net_device *orig_dev;
+	struct net_device *master;
	struct net_device *null_or_orig;
	struct net_device *null_or_bond;
	int ret = NET_RX_DROP;
@@ -2503,11 +2504,12 @@ int netif_receive_skb(struct sk_buff *skb)

	null_or_orig = NULL;
	orig_dev = skb->dev;
-	if (orig_dev->master) {
-		if (skb_bond_should_drop(skb))
+	master = ACCESS_ONCE(orig_dev->master);
+	if (master) {
+		if (skb_bond_should_drop(skb, master))
			null_or_orig = orig_dev; /* deliver only exact match */
		else
-			skb->dev = orig_dev->master;
+			skb->dev = master;
	}

	__get_cpu_var(netdev_rx_stat).total++;
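
skb->dev->master can be cleared by the bonding driver while netif_receive_skb() is running, so reading the pointer more than once may observe two different values. ACCESS_ONCE() forces a single load into a local variable, and the NULL test and the later uses all work on that one snapshot. A reduced sketch with hypothetical types:

/* Hypothetical reduction of the snapshot idiom used above. */
#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

struct dev {
	struct dev *master;	/* may be rewritten by another CPU */
};

static struct dev *pick_rx_dev(struct dev *orig)
{
	struct dev *master = ACCESS_ONCE(orig->master);

	/* The NULL test and the use below see the same value even if
	 * orig->master changes concurrently. */
	if (master)
		return master;
	return orig;
}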
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index af5d89792860..01ef8ba9025c 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -961,7 +961,9 @@ fib_find_node(struct trie *t, u32 key)
	struct node *n;

	pos = 0;
-	n = rcu_dereference(t->trie);
+	n = rcu_dereference_check(t->trie,
+				  rcu_read_lock_held() ||
+				  lockdep_rtnl_is_held());

	while (n != NULL && NODE_TYPE(n) == T_TNODE) {
		tn = (struct tnode *) n;
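
fib_find_node() is called both from the RCU-protected lookup path and from configuration paths that hold the RTNL instead, so a plain rcu_dereference() would trigger the new lockdep-based RCU checking in the latter case. rcu_dereference_check() takes the full list of conditions under which the access is legal. The same pattern around a hypothetical RCU-protected pointer:

/* Hypothetical pointer; the check condition mirrors the hunk above. */
#include <linux/rcupdate.h>
#include <linux/rtnetlink.h>

struct item {
	int value;
};

static struct item *global_item;

static int read_item_value(void)
{
	struct item *it;

	/* Legal under rcu_read_lock() or while holding the RTNL. */
	it = rcu_dereference_check(global_item,
				   rcu_read_lock_held() ||
				   lockdep_rtnl_is_held());
	return it ? it->value : -1;
}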
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f47c9f76754b..f78402d097b3 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -810,11 +810,13 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev
			tunnel->err_count = 0;
	}

-	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
+	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->u.dst.header_len;

	if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
+		if (max_headroom > dev->needed_headroom)
+			dev->needed_headroom = max_headroom;
		if (!new_skb) {
			ip_rt_put(rt);
			txq->tx_dropped++;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 8582e12e4a62..0b9d03c54dc3 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -802,6 +802,9 @@ static int ipmr_mfc_add(struct net *net, struct mfcctl *mfc, int mrtsock)
	int line;
	struct mfc_cache *uc, *c, **cp;

+	if (mfc->mfcc_parent >= MAXVIFS)
+		return -ENFILE;
+
	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

	for (cp = &net->ipv4.mfc_cache_array[line];
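
mfcc_parent arrives from userspace via setsockopt() and is later used as a vif-table index, so it must be range-checked before anything dereferences it; ip6mr.c below gains the identical guard for MAXMIFS. The general shape, with hypothetical names:

/* Hypothetical reduction: validate a user-supplied index before use. */
#define TABLE_SIZE 32

static int table[TABLE_SIZE];

static int set_entry(unsigned int user_index, int value)
{
	if (user_index >= TABLE_SIZE)	/* reject before indexing */
		return -1;
	table[user_index] = value;
	return 0;
}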
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a770df2493d2..54fd68c14c87 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1441,7 +1441,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
			dev_hold(rt->u.dst.dev);
			if (rt->idev)
				in_dev_hold(rt->idev);
-			rt->u.dst.obsolete = 0;
+			rt->u.dst.obsolete = -1;
			rt->u.dst.lastuse = jiffies;
			rt->u.dst.path = &rt->u.dst;
			rt->u.dst.neighbour = NULL;
@@ -1506,11 +1506,12 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
	struct dst_entry *ret = dst;

	if (rt) {
-		if (dst->obsolete) {
+		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
-			   rt->u.dst.expires) {
+			   (rt->u.dst.expires &&
+			    time_after_eq(jiffies, rt->u.dst.expires))) {
			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
						rt->fl.oif,
						rt_genid(dev_net(dst->dev)));
@@ -1726,7 +1727,9 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)

 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
 {
-	return NULL;
+	if (rt_is_expired((struct rtable *)dst))
+		return NULL;
+	return dst;
 }

 static void ipv4_dst_destroy(struct dst_entry *dst)
@@ -1888,7 +1891,8 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
	if (!rth)
		goto e_nobufs;

-	rth->u.dst.output= ip_rt_bug;
+	rth->u.dst.output = ip_rt_bug;
+	rth->u.dst.obsolete = -1;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags= DST_HOST;
@@ -2054,6 +2058,7 @@ static int __mkroute_input(struct sk_buff *skb,
	rth->fl.oif	= 0;
	rth->rt_spec_dst= spec_dst;

+	rth->u.dst.obsolete = -1;
	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;
	rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
@@ -2218,6 +2223,7 @@ local_input:
		goto e_nobufs;

	rth->u.dst.output= ip_rt_bug;
+	rth->u.dst.obsolete = -1;
	rth->rt_genid = rt_genid(net);

	atomic_set(&rth->u.dst.__refcnt, 1);
@@ -2444,6 +2450,7 @@ static int __mkroute_output(struct rtable **result,
	rth->rt_spec_dst= fl->fl4_src;

	rth->u.dst.output=ip_output;
+	rth->u.dst.obsolete = -1;
	rth->rt_genid = rt_genid(dev_net(dev_out));

	RT_CACHE_STAT_INC(out_slow_tot);
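
Setting dst.obsolete to -1 on every cached route makes callers revalidate the entry through ipv4_dst_check(), which now keeps the dst alive unless its routing-cache generation id is stale, instead of unconditionally returning NULL. A simplified, hypothetical sketch of that genid test (not the exact dst_ops wiring):

/* Hypothetical reduction of the genid-based revalidation above. */
#include <linux/types.h>

struct sketch_dst {
	int obsolete;		/* -1: ask the check callback */
	u32 rt_genid;		/* generation the entry was created in */
};

static u32 cache_genid;		/* bumped on every routing cache flush */

static struct sketch_dst *sketch_dst_check(struct sketch_dst *dst)
{
	if (dst->obsolete < 0 && dst->rt_genid != cache_genid)
		return NULL;	/* expired: caller re-does the lookup */
	return dst;
}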
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5901010fad55..6afb6d8662b2 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -429,7 +429,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
	if (tp->urg_seq == tp->copied_seq &&
	    !sock_flag(sk, SOCK_URGINLINE) &&
	    tp->urg_data)
-		target--;
+		target++;

	/* Potential race condition. If read of tp below will
	 * escape above sk->sk_state, we can be illegally awaken
@@ -1254,6 +1254,39 @@ static void tcp_prequeue_process(struct sock *sk)
	tp->ucopy.memory = 0;
 }

+#ifdef CONFIG_NET_DMA
+static void tcp_service_net_dma(struct sock *sk, bool wait)
+{
+	dma_cookie_t done, used;
+	dma_cookie_t last_issued;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tp->ucopy.dma_chan)
+		return;
+
+	last_issued = tp->ucopy.dma_cookie;
+	dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+
+	do {
+		if (dma_async_memcpy_complete(tp->ucopy.dma_chan,
+					      last_issued, &done,
+					      &used) == DMA_SUCCESS) {
+			/* Safe to free early-copied skbs now */
+			__skb_queue_purge(&sk->sk_async_wait_queue);
+			break;
+		} else {
+			struct sk_buff *skb;
+			while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
+			       (dma_async_is_complete(skb->dma_cookie, done,
+						      used) == DMA_SUCCESS)) {
+				__skb_dequeue(&sk->sk_async_wait_queue);
+				kfree_skb(skb);
+			}
+		}
+	} while (wait);
+}
+#endif
+
 static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
 {
	struct sk_buff *skb;
@@ -1546,6 +1579,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		/* __ Set realtime policy in scheduler __ */
	}

+#ifdef CONFIG_NET_DMA
+	if (tp->ucopy.dma_chan)
+		dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+#endif
	if (copied >= target) {
		/* Do not sleep, just process backlog. */
		release_sock(sk);
@@ -1554,6 +1591,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		sk_wait_data(sk, &timeo);

 #ifdef CONFIG_NET_DMA
+		tcp_service_net_dma(sk, false);  /* Don't block */
		tp->ucopy.wakeup = 0;
 #endif

@@ -1633,6 +1671,9 @@ do_prequeue:
				copied = -EFAULT;
				break;
			}
+
+			dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
+
			if ((offset + used) == skb->len)
				copied_early = 1;

@@ -1702,27 +1743,9 @@ skip_copy:
	}

 #ifdef CONFIG_NET_DMA
-	if (tp->ucopy.dma_chan) {
-		dma_cookie_t done, used;
-
-		dma_async_memcpy_issue_pending(tp->ucopy.dma_chan);
-
-		while (dma_async_memcpy_complete(tp->ucopy.dma_chan,
-						 tp->ucopy.dma_cookie, &done,
-						 &used) == DMA_IN_PROGRESS) {
-			/* do partial cleanup of sk_async_wait_queue */
-			while ((skb = skb_peek(&sk->sk_async_wait_queue)) &&
-			       (dma_async_is_complete(skb->dma_cookie, done,
-						      used) == DMA_SUCCESS)) {
-				__skb_dequeue(&sk->sk_async_wait_queue);
-				kfree_skb(skb);
-			}
-		}
+	tcp_service_net_dma(sk, true);  /* Wait for queue to drain */
+	tp->ucopy.dma_chan = NULL;

-		/* Safe to free early-copied skbs now */
-		__skb_queue_purge(&sk->sk_async_wait_queue);
-		tp->ucopy.dma_chan = NULL;
-	}
	if (tp->ucopy.pinned_list) {
		dma_unpin_iovec_pages(tp->ucopy.pinned_list);
		tp->ucopy.pinned_list = NULL;
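
The duplicated drain logic in tcp_recvmsg() collapses into tcp_service_net_dma(): called with wait=false inside the receive loop it only reaps async copies that already finished, and with wait=true on the way out it loops until the channel has drained so every early-copied skb can be freed. A compilable toy reduction of that control flow; the queue is simulated, so all names here are hypothetical:

#include <stdbool.h>

/* Hypothetical completion queue standing in for the net_dma cookies. */
struct async_queue {
	int pending;	/* copies issued but not yet completed */
	int done;	/* copies completed but not yet freed */
};

static void reap_completed(struct async_queue *q)
{
	q->done = 0;			/* free every finished skb */
}

static void service_queue(struct async_queue *q, bool wait)
{
	do {
		if (q->pending == 0) {	/* channel fully drained */
			reap_completed(q);
			break;
		}
		reap_completed(q);	/* partial cleanup, keep waiting */
		q->pending--;		/* simulate engine progress */
	} while (wait);			/* wait=false: single pass only */
}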
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 788851ca8c5d..c096a4218b8f 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2511,6 +2511,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets)
	int err;
	unsigned int mss;

+	if (packets == 0)
+		return;
+
	WARN_ON(packets > tp->packets_out);
	if (tp->lost_skb_hint) {
		skb = tp->lost_skb_hint;
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 70df40980a87..f4df5f931f36 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -370,6 +370,11 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
	if (sk->sk_state == TCP_CLOSE)
		goto out;

+	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
+		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
+		goto out;
+	}
+
	icsk = inet_csk(sk);
	tp = tcp_sk(sk);
	seq = ntohl(th->seq);
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index 52e0f74fdfe0..23e4ac0cc30e 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -1113,6 +1113,9 @@ static int ip6mr_mfc_add(struct net *net, struct mf6cctl *mfc, int mrtsock)
	unsigned char ttls[MAXMIFS];
	int i;

+	if (mfc->mf6cc_parent >= MAXMIFS)
+		return -ENFILE;
+
	memset(ttls, 255, MAXMIFS);
	for (i = 0; i < MAXMIFS; i++) {
		if (IF_ISSET(i, &mfc->mf6cc_ifset))
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 52cd3eff31dc..7fcb0e5d1213 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -879,7 +879,7 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)

	rt = (struct rt6_info *) dst;

-	if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
+	if (rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
		return dst;

	return NULL;
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 2b2af631d2b8..569410a85953 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -582,7 +582,9 @@ nla_put_failure:
 nlmsg_failure:
	kfree_skb(skb);
 errout:
-	nfnetlink_set_err(net, 0, group, -ENOBUFS);
+	if (nfnetlink_set_err(net, 0, group, -ENOBUFS) > 0)
+		return -ENOBUFS;
+
	return 0;
 }
 #endif /* CONFIG_NF_CONNTRACK_EVENTS */
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index 8eb0cc23ada3..6afa3d52ea5f 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -113,9 +113,9 @@ int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 pid,
 }
 EXPORT_SYMBOL_GPL(nfnetlink_send);

-void nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error)
+int nfnetlink_set_err(struct net *net, u32 pid, u32 group, int error)
 {
-	netlink_set_err(net->nfnl, pid, group, error);
+	return netlink_set_err(net->nfnl, pid, group, error);
 }
 EXPORT_SYMBOL_GPL(nfnetlink_set_err);

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 320d0423a240..acbbae1e89b5 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1093,6 +1093,7 @@ static inline int do_one_set_err(struct sock *sk,
1093 struct netlink_set_err_data *p) 1093 struct netlink_set_err_data *p)
1094{ 1094{
1095 struct netlink_sock *nlk = nlk_sk(sk); 1095 struct netlink_sock *nlk = nlk_sk(sk);
1096 int ret = 0;
1096 1097
1097 if (sk == p->exclude_sk) 1098 if (sk == p->exclude_sk)
1098 goto out; 1099 goto out;
@@ -1104,10 +1105,15 @@ static inline int do_one_set_err(struct sock *sk,
1104 !test_bit(p->group - 1, nlk->groups)) 1105 !test_bit(p->group - 1, nlk->groups))
1105 goto out; 1106 goto out;
1106 1107
1108 if (p->code == ENOBUFS && nlk->flags & NETLINK_RECV_NO_ENOBUFS) {
1109 ret = 1;
1110 goto out;
1111 }
1112
1107 sk->sk_err = p->code; 1113 sk->sk_err = p->code;
1108 sk->sk_error_report(sk); 1114 sk->sk_error_report(sk);
1109out: 1115out:
1110 return 0; 1116 return ret;
1111} 1117}
1112 1118
1113/** 1119/**
@@ -1116,12 +1122,16 @@ out:
1116 * @pid: the PID of a process that we want to skip (if any) 1122 * @pid: the PID of a process that we want to skip (if any)
1117 * @groups: the broadcast group that will notice the error 1123 * @groups: the broadcast group that will notice the error
1118 * @code: error code, must be negative (as usual in kernelspace) 1124 * @code: error code, must be negative (as usual in kernelspace)
1125 *
1126 * This function returns the number of broadcast listeners that have set the
1127 * NETLINK_RECV_NO_ENOBUFS socket option.
1119 */ 1128 */
1120void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code) 1129int netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code)
1121{ 1130{
1122 struct netlink_set_err_data info; 1131 struct netlink_set_err_data info;
1123 struct hlist_node *node; 1132 struct hlist_node *node;
1124 struct sock *sk; 1133 struct sock *sk;
1134 int ret = 0;
1125 1135
1126 info.exclude_sk = ssk; 1136 info.exclude_sk = ssk;
1127 info.pid = pid; 1137 info.pid = pid;
@@ -1132,9 +1142,10 @@ void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code)
1132 read_lock(&nl_table_lock); 1142 read_lock(&nl_table_lock);
1133 1143
1134 sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list) 1144 sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list)
1135 do_one_set_err(sk, &info); 1145 ret += do_one_set_err(sk, &info);
1136 1146
1137 read_unlock(&nl_table_lock); 1147 read_unlock(&nl_table_lock);
1148 return ret;
1138} 1149}
1139EXPORT_SYMBOL(netlink_set_err); 1150EXPORT_SYMBOL(netlink_set_err);
1140 1151
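
Turning netlink_set_err() into an int is what the ctnetlink hunk above relies on: each do_one_set_err() reports whether the target socket opted out of ENOBUFS delivery via NETLINK_RECV_NO_ENOBUFS, the counts are summed, and a caller that sees a nonzero total knows at least one listener silently missed the overrun and can handle the loss itself. A hypothetical reduction of the iterate-and-count shape:

/* Hypothetical reduction; none of these names exist in the kernel. */
#define NSOCKS 8

struct lsock {
	int subscribed;
	int no_enobufs;		/* NETLINK_RECV_NO_ENOBUFS analogue */
	int err;
};

static struct lsock socks[NSOCKS];

/* Returns 1 if the socket suppresses ENOBUFS reporting, else 0. */
static int set_one_err(struct lsock *s, int code)
{
	if (!s->subscribed)
		return 0;
	if (s->no_enobufs)
		return 1;	/* this listener will never see the error */
	s->err = code;
	return 0;
}

/* Like netlink_set_err(): count the listeners the error cannot reach. */
static int set_err_all(int code)
{
	int i, ret = 0;

	for (i = 0; i < NSOCKS; i++)
		ret += set_one_err(&socks[i], code);
	return ret;
}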
diff --git a/net/rxrpc/ar-accept.c b/net/rxrpc/ar-accept.c
index 77228f28fa36..2d744f22a9a1 100644
--- a/net/rxrpc/ar-accept.c
+++ b/net/rxrpc/ar-accept.c
@@ -88,6 +88,11 @@ static int rxrpc_accept_incoming_call(struct rxrpc_local *local,
88 88
89 /* get a notification message to send to the server app */ 89 /* get a notification message to send to the server app */
90 notification = alloc_skb(0, GFP_NOFS); 90 notification = alloc_skb(0, GFP_NOFS);
91 if (!notification) {
92 _debug("no memory");
93 ret = -ENOMEM;
94 goto error_nofree;
95 }
91 rxrpc_new_skb(notification); 96 rxrpc_new_skb(notification);
92 notification->mark = RXRPC_SKB_MARK_NEW_CALL; 97 notification->mark = RXRPC_SKB_MARK_NEW_CALL;
93 98
@@ -189,6 +194,7 @@ invalid_service:
189 ret = -ECONNREFUSED; 194 ret = -ECONNREFUSED;
190error: 195error:
191 rxrpc_free_skb(notification); 196 rxrpc_free_skb(notification);
197error_nofree:
192 _leave(" = %d", ret); 198 _leave(" = %d", ret);
193 return ret; 199 return ret;
194} 200}
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 0cfccc2a0297..c389ccf6437d 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1280,9 +1280,8 @@ alloc_enc_pages(struct rpc_rqst *rqstp)
	rqstp->rq_release_snd_buf = priv_release_snd_buf;
	return 0;
 out_free:
-	for (i--; i >= 0; i--) {
-		__free_page(rqstp->rq_enc_pages[i]);
-	}
+	rqstp->rq_enc_pages_num = i;
+	priv_release_snd_buf(rqstp);
 out:
	return -EAGAIN;
 }
diff --git a/net/sunrpc/bc_svc.c b/net/sunrpc/bc_svc.c
index 13f214f53120..f0c05d3311c1 100644
--- a/net/sunrpc/bc_svc.c
+++ b/net/sunrpc/bc_svc.c
@@ -37,21 +37,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #define RPCDBG_FACILITY	RPCDBG_SVCDSP

-void bc_release_request(struct rpc_task *task)
-{
-	struct rpc_rqst *req = task->tk_rqstp;
-
-	dprintk("RPC: bc_release_request: task= %p\n", task);
-
-	/*
-	 * Release this request only if it's a backchannel
-	 * preallocated request
-	 */
-	if (!bc_prealloc(req))
-		return;
-	xprt_free_bc_request(req);
-}
-
 /* Empty callback ops */
 static const struct rpc_call_ops nfs41_callback_ops = {
 };
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 154034b675bd..19c9983d5360 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -659,6 +659,7 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req,
	task = rpc_new_task(&task_setup_data);
	if (!task) {
		xprt_free_bc_request(req);
+		task = ERR_PTR(-ENOMEM);
		goto out;
	}
	task->tk_rqstp = req;
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 8d63f8fd29b7..20e30c6f8355 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -587,6 +587,8 @@ static struct dentry *__rpc_lookup_create_exclusive(struct dentry *parent,
	struct dentry *dentry;

	dentry = __rpc_lookup_create(parent, name);
+	if (IS_ERR(dentry))
+		return dentry;
	if (dentry->d_inode == NULL)
		return dentry;
	dput(dentry);
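
__rpc_lookup_create() can return an ERR_PTR() value, and the old code went straight to dentry->d_inode, turning a failed lookup into an oops. The guard must run before any dereference. The canonical discipline, with a hypothetical lookup standing in:

/* Sketch of the ERR_PTR/IS_ERR discipline applied above. */
#include <linux/err.h>
#include <linux/errno.h>

struct thing {
	int inited;
};

static struct thing the_thing = { .inited = 1 };

/* Hypothetical lookup that reports failure via ERR_PTR(). */
static struct thing *lookup_thing(int ok)
{
	return ok ? &the_thing : ERR_PTR(-ENOMEM);
}

static struct thing *lookup_checked(int ok)
{
	struct thing *t = lookup_thing(ok);

	if (IS_ERR(t))		/* check before any dereference */
		return t;
	if (!t->inited)
		return ERR_PTR(-ENOENT);
	return t;
}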
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 469de292c23c..42f09ade0044 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -46,6 +46,7 @@

 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/metrics.h>
+#include <linux/sunrpc/bc_xprt.h>

 #include "sunrpc.h"

@@ -1032,21 +1033,16 @@ void xprt_release(struct rpc_task *task)
	if (req->rq_release_snd_buf)
		req->rq_release_snd_buf(req);

-	/*
-	 * Early exit if this is a backchannel preallocated request.
-	 * There is no need to have it added to the RPC slot list.
-	 */
-	if (is_bc_request)
-		return;
-
-	memset(req, 0, sizeof(*req));	/* mark unused */
-
	dprintk("RPC: %5u release request %p\n", task->tk_pid, req);
+	if (likely(!is_bc_request)) {
+		memset(req, 0, sizeof(*req));	/* mark unused */

-	spin_lock(&xprt->reserve_lock);
-	list_add(&req->rq_list, &xprt->free);
-	rpc_wake_up_next(&xprt->backlog);
-	spin_unlock(&xprt->reserve_lock);
+		spin_lock(&xprt->reserve_lock);
+		list_add(&req->rq_list, &xprt->free);
+		rpc_wake_up_next(&xprt->backlog);
+		spin_unlock(&xprt->reserve_lock);
+	} else
+		xprt_free_bc_request(req);
 }

 /**
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index e4839c07c913..9847c30b5001 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2251,9 +2251,6 @@ static struct rpc_xprt_ops xs_tcp_ops = {
	.buf_free		= rpc_free,
	.send_request		= xs_tcp_send_request,
	.set_retrans_timeout	= xprt_set_retrans_timeout_def,
-#if defined(CONFIG_NFS_V4_1)
-	.release_request	= bc_release_request,
-#endif /* CONFIG_NFS_V4_1 */
	.close			= xs_tcp_close,
	.destroy		= xs_destroy,
	.print_stats		= xs_tcp_print_stats,
diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl
index f76f3d13276d..6f97a13bcee4 100755
--- a/scripts/get_maintainer.pl
+++ b/scripts/get_maintainer.pl
@@ -284,7 +284,7 @@ foreach my $file (@ARGV) {
    my $file_cnt = @files;
    my $lastfile;

-    open(my $patch, '<', $file)
+    open(my $patch, "< $file")
	or die "$P: Can't open $file: $!\n";
    while (<$patch>) {
	my $patch_line = $_;
diff --git a/scripts/kernel-doc b/scripts/kernel-doc
index c7865c362d28..fcdfb245a575 100755
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -1424,6 +1424,8 @@ sub dump_struct($$) {
	$nested =~ s/\/\*.*?\*\///gos;
	# strip kmemcheck_bitfield_{begin,end}.*;
	$members =~ s/kmemcheck_bitfield_.*?;//gos;
+	# strip attributes
+	$members =~ s/__aligned\s*\(\d+\)//gos;

	create_parameterlist($members, ';', $file);
	check_sections($file, $declaration_name, "struct", $sectcheck, $struct_actual, $nested);
@@ -1728,6 +1730,7 @@ sub dump_function($$) {
    $prototype =~ s/^noinline +//;
    $prototype =~ s/__devinit +//;
    $prototype =~ s/__init +//;
+    $prototype =~ s/__init_or_module +//;
    $prototype =~ s/^#\s*define\s+//; #ak added
    $prototype =~ s/__attribute__\s*\(\([a-z,]*\)\)//;

diff --git a/sound/arm/pxa2xx-pcm-lib.c b/sound/arm/pxa2xx-pcm-lib.c
index 743ac6a29065..fd51fa8b06a1 100644
--- a/sound/arm/pxa2xx-pcm-lib.c
+++ b/sound/arm/pxa2xx-pcm-lib.c
@@ -205,6 +205,7 @@ int __pxa2xx_pcm_open(struct snd_pcm_substream *substream)
	if (!rtd->dma_desc_array)
		goto err1;

+	rtd->dma_ch = -1;
	runtime->private_data = rtd;
	return 0;

diff --git a/sound/oss/vidc.c b/sound/oss/vidc.c
index 725fef0f59a3..a4127bab9231 100644
--- a/sound/oss/vidc.c
+++ b/sound/oss/vidc.c
@@ -363,13 +363,13 @@ static void vidc_audio_trigger(int dev, int enable_bits)
	struct audio_operations *adev = audio_devs[dev];

	if (enable_bits & PCM_ENABLE_OUTPUT) {
-		if (!(adev->flags & DMA_ACTIVE)) {
+		if (!(adev->dmap_out->flags & DMA_ACTIVE)) {
			unsigned long flags;

			local_irq_save(flags);

			/* prevent recusion */
-			adev->flags |= DMA_ACTIVE;
+			adev->dmap_out->flags |= DMA_ACTIVE;

			dma_interrupt = vidc_audio_dma_interrupt;
			vidc_sound_dma_irq(0, NULL);
diff --git a/sound/pci/cmipci.c b/sound/pci/cmipci.c
index 1ded64e05643..329968edca9b 100644
--- a/sound/pci/cmipci.c
+++ b/sound/pci/cmipci.c
@@ -941,13 +941,21 @@ static snd_pcm_uframes_t snd_cmipci_pcm_pointer(struct cmipci *cm, struct cmipci
					     struct snd_pcm_substream *substream)
 {
	size_t ptr;
-	unsigned int reg;
+	unsigned int reg, rem, tries;
+
	if (!rec->running)
		return 0;
 #if 1 // this seems better..
	reg = rec->ch ? CM_REG_CH1_FRAME2 : CM_REG_CH0_FRAME2;
-	ptr = rec->dma_size - (snd_cmipci_read_w(cm, reg) + 1);
-	ptr >>= rec->shift;
+	for (tries = 0; tries < 3; tries++) {
+		rem = snd_cmipci_read_w(cm, reg);
+		if (rem < rec->dma_size)
+			goto ok;
+	}
+	printk(KERN_ERR "cmipci: invalid PCM pointer: %#x\n", rem);
+	return SNDRV_PCM_POS_XRUN;
+ok:
+	ptr = (rec->dma_size - (rem + 1)) >> rec->shift;
 #else
	reg = rec->ch ? CM_REG_CH1_FRAME1 : CM_REG_CH0_FRAME1;
	ptr = snd_cmipci_read(cm, reg) - rec->offset;
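
The DMA position register on this chip can momentarily read back a value beyond the programmed DMA size; the fix retries the read up to three times, accepts the first in-range value, and otherwise reports an XRUN position rather than feeding ALSA a bogus pointer. Bounded-retry reads of a flaky register generally look like this (hypothetical accessor):

/* Hypothetical accessor; mirrors the bounded-retry shape above. */
#include <linux/errno.h>
#include <linux/types.h>

static u16 read_pos_reg(void)
{
	return 0;	/* stand-in for the real MMIO read */
}

static int read_valid_pos(unsigned int dma_size, u16 *pos)
{
	unsigned int tries;
	u16 rem = 0;

	for (tries = 0; tries < 3; tries++) {
		rem = read_pos_reg();
		if (rem < dma_size) {	/* plausible value: accept it */
			*pos = rem;
			return 0;
		}
	}
	return -EIO;			/* three bad reads: give up */
}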
diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c
index 194a28c54992..61682e1d09da 100644
--- a/sound/pci/hda/patch_conexant.c
+++ b/sound/pci/hda/patch_conexant.c
@@ -1591,6 +1591,21 @@ static int patch_cxt5047(struct hda_codec *codec)
 #endif
	}
	spec->vmaster_nid = 0x13;
+
+	switch (codec->subsystem_id >> 16) {
+	case 0x103c:
+		/* HP laptops have really bad sound over 0 dB on NID 0x10.
+		 * Fix max PCM level to 0 dB (originally it has 0x1e steps
+		 * with 0 dB offset 0x17)
+		 */
+		snd_hda_override_amp_caps(codec, 0x10, HDA_INPUT,
+					  (0x17 << AC_AMPCAP_OFFSET_SHIFT) |
+					  (0x17 << AC_AMPCAP_NUM_STEPS_SHIFT) |
+					  (0x05 << AC_AMPCAP_STEP_SIZE_SHIFT) |
+					  (1 << AC_AMPCAP_MUTE_SHIFT));
+		break;
+	}
+
	return 0;
 }

diff --git a/sound/pci/hda/patch_nvhdmi.c b/sound/pci/hda/patch_nvhdmi.c
index 70669a246902..3c10c0b149f4 100644
--- a/sound/pci/hda/patch_nvhdmi.c
+++ b/sound/pci/hda/patch_nvhdmi.c
@@ -538,8 +538,6 @@ static int patch_nvhdmi_2ch(struct hda_codec *codec)
538 * patch entries 538 * patch entries
539 */ 539 */
540static struct hda_codec_preset snd_hda_preset_nvhdmi[] = { 540static struct hda_codec_preset snd_hda_preset_nvhdmi[] = {
541 { .id = 0x10de0067, .name = "MCP67 HDMI", .patch = patch_nvhdmi_2ch },
542 { .id = 0x10de8001, .name = "MCP73 HDMI", .patch = patch_nvhdmi_2ch },
543 { .id = 0x10de0002, .name = "MCP77/78 HDMI", 541 { .id = 0x10de0002, .name = "MCP77/78 HDMI",
544 .patch = patch_nvhdmi_8ch_7x }, 542 .patch = patch_nvhdmi_8ch_7x },
545 { .id = 0x10de0003, .name = "MCP77/78 HDMI", 543 { .id = 0x10de0003, .name = "MCP77/78 HDMI",
@@ -550,12 +548,16 @@ static struct hda_codec_preset snd_hda_preset_nvhdmi[] = {
550 .patch = patch_nvhdmi_8ch_7x }, 548 .patch = patch_nvhdmi_8ch_7x },
551 { .id = 0x10de0007, .name = "MCP79/7A HDMI", 549 { .id = 0x10de0007, .name = "MCP79/7A HDMI",
552 .patch = patch_nvhdmi_8ch_7x }, 550 .patch = patch_nvhdmi_8ch_7x },
553 { .id = 0x10de000c, .name = "MCP89 HDMI", 551 { .id = 0x10de000a, .name = "GT220 HDMI",
554 .patch = patch_nvhdmi_8ch_89 }, 552 .patch = patch_nvhdmi_8ch_89 },
555 { .id = 0x10de000b, .name = "GT21x HDMI", 553 { .id = 0x10de000b, .name = "GT21x HDMI",
556 .patch = patch_nvhdmi_8ch_89 }, 554 .patch = patch_nvhdmi_8ch_89 },
555 { .id = 0x10de000c, .name = "MCP89 HDMI",
556 .patch = patch_nvhdmi_8ch_89 },
557 { .id = 0x10de000d, .name = "GT240 HDMI", 557 { .id = 0x10de000d, .name = "GT240 HDMI",
558 .patch = patch_nvhdmi_8ch_89 }, 558 .patch = patch_nvhdmi_8ch_89 },
559 { .id = 0x10de0067, .name = "MCP67 HDMI", .patch = patch_nvhdmi_2ch },
560 { .id = 0x10de8001, .name = "MCP73 HDMI", .patch = patch_nvhdmi_2ch },
559 {} /* terminator */ 561 {} /* terminator */
560}; 562};
561 563
@@ -564,11 +566,12 @@ MODULE_ALIAS("snd-hda-codec-id:10de0003");
564MODULE_ALIAS("snd-hda-codec-id:10de0005"); 566MODULE_ALIAS("snd-hda-codec-id:10de0005");
565MODULE_ALIAS("snd-hda-codec-id:10de0006"); 567MODULE_ALIAS("snd-hda-codec-id:10de0006");
566MODULE_ALIAS("snd-hda-codec-id:10de0007"); 568MODULE_ALIAS("snd-hda-codec-id:10de0007");
567MODULE_ALIAS("snd-hda-codec-id:10de0067"); 569MODULE_ALIAS("snd-hda-codec-id:10de000a");
568MODULE_ALIAS("snd-hda-codec-id:10de8001");
569MODULE_ALIAS("snd-hda-codec-id:10de000c");
570MODULE_ALIAS("snd-hda-codec-id:10de000b"); 570MODULE_ALIAS("snd-hda-codec-id:10de000b");
571MODULE_ALIAS("snd-hda-codec-id:10de000c");
571MODULE_ALIAS("snd-hda-codec-id:10de000d"); 572MODULE_ALIAS("snd-hda-codec-id:10de000d");
573MODULE_ALIAS("snd-hda-codec-id:10de0067");
574MODULE_ALIAS("snd-hda-codec-id:10de8001");
572 575
573MODULE_LICENSE("GPL"); 576MODULE_LICENSE("GPL");
574MODULE_DESCRIPTION("NVIDIA HDMI HD-audio codec"); 577MODULE_DESCRIPTION("NVIDIA HDMI HD-audio codec");
diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c
index 4ec57633af88..053d53d8c8b2 100644
--- a/sound/pci/hda/patch_realtek.c
+++ b/sound/pci/hda/patch_realtek.c
@@ -2532,8 +2532,6 @@ static int alc_build_controls(struct hda_codec *codec)
2532 return err; 2532 return err;
2533 } 2533 }
2534 2534
2535 alc_free_kctls(codec); /* no longer needed */
2536
2537 /* assign Capture Source enums to NID */ 2535 /* assign Capture Source enums to NID */
2538 kctl = snd_hda_find_mixer_ctl(codec, "Capture Source"); 2536 kctl = snd_hda_find_mixer_ctl(codec, "Capture Source");
2539 if (!kctl) 2537 if (!kctl)
@@ -2602,6 +2600,9 @@ static int alc_build_controls(struct hda_codec *codec)
2602 } 2600 }
2603 } 2601 }
2604 } 2602 }
2603
2604 alc_free_kctls(codec); /* no longer needed */
2605
2605 return 0; 2606 return 0;
2606} 2607}
2607 2608
diff --git a/sound/pci/hda/patch_sigmatel.c b/sound/pci/hda/patch_sigmatel.c
index 8c416bb18a57..c4be3fab94e5 100644
--- a/sound/pci/hda/patch_sigmatel.c
+++ b/sound/pci/hda/patch_sigmatel.c
@@ -1730,6 +1730,8 @@ static struct snd_pci_quirk stac92hd71bxx_cfg_tbl[] = {
1730 "HP HDX", STAC_HP_HDX), /* HDX16 */ 1730 "HP HDX", STAC_HP_HDX), /* HDX16 */
1731 SND_PCI_QUIRK_MASK(PCI_VENDOR_ID_HP, 0xfff0, 0x3620, 1731 SND_PCI_QUIRK_MASK(PCI_VENDOR_ID_HP, 0xfff0, 0x3620,
1732 "HP dv6", STAC_HP_DV5), 1732 "HP dv6", STAC_HP_DV5),
1733 SND_PCI_QUIRK(PCI_VENDOR_ID_HP, 0x3061,
1734 "HP dv6", STAC_HP_DV5), /* HP dv6-1110ax */
1733 SND_PCI_QUIRK_MASK(PCI_VENDOR_ID_HP, 0xfff0, 0x7010, 1735 SND_PCI_QUIRK_MASK(PCI_VENDOR_ID_HP, 0xfff0, 0x7010,
1734 "HP", STAC_HP_DV5), 1736 "HP", STAC_HP_DV5),
1735 SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x0233, 1737 SND_PCI_QUIRK(PCI_VENDOR_ID_DELL, 0x0233,
diff --git a/sound/soc/codecs/tlv320dac33.c b/sound/soc/codecs/tlv320dac33.c
index f9f367d29a90..d50f1699ccb2 100644
--- a/sound/soc/codecs/tlv320dac33.c
+++ b/sound/soc/codecs/tlv320dac33.c
@@ -778,7 +778,7 @@ static int dac33_prepare_chip(struct snd_pcm_substream *substream)
	if (dac33->fifo_mode) {
		/* Generic for all FIFO modes */
		/* 50-51 : ASRC Control registers */
-		dac33_write(codec, DAC33_ASRC_CTRL_A, (1 << 4)); /* div=2 */
+		dac33_write(codec, DAC33_ASRC_CTRL_A, DAC33_SRCLKDIV(1));
		dac33_write(codec, DAC33_ASRC_CTRL_B, 1); /* ??? */

		/* Write registers 0x34 and 0x35 (MSB, LSB) */
@@ -1038,11 +1038,7 @@ static int dac33_set_dai_fmt(struct snd_soc_dai *codec_dai,
	case SND_SOC_DAIFMT_DSP_A:
		aictrl_a |= DAC33_AFMT_DSP;
		aictrl_b &= ~DAC33_DATA_DELAY_MASK;
-		aictrl_b |= DAC33_DATA_DELAY(1); /* 1 bit delay */
-		break;
-	case SND_SOC_DAIFMT_DSP_B:
-		aictrl_a |= DAC33_AFMT_DSP;
-		aictrl_b &= ~DAC33_DATA_DELAY_MASK; /* No delay */
+		aictrl_b |= DAC33_DATA_DELAY(0);
		break;
	case SND_SOC_DAIFMT_RIGHT_J:
		aictrl_a |= DAC33_AFMT_RIGHT_J;
@@ -1066,7 +1062,7 @@ static void dac33_init_chip(struct snd_soc_codec *codec)
 {
	/* 44-46: DAC Control Registers */
	/* A : DAC sample rate Fsref/1.5 */
-	dac33_write(codec, DAC33_DAC_CTRL_A, DAC33_DACRATE(1));
+	dac33_write(codec, DAC33_DAC_CTRL_A, DAC33_DACRATE(0));
	/* B : DAC src=normal, not muted */
	dac33_write(codec, DAC33_DAC_CTRL_B, DAC33_DACSRCR_RIGHT |
		    DAC33_DACSRCL_LEFT);
diff --git a/sound/soc/codecs/wm_hubs.c b/sound/soc/codecs/wm_hubs.c
index 0ad9f5d536c6..486bdd21a98a 100644
--- a/sound/soc/codecs/wm_hubs.c
+++ b/sound/soc/codecs/wm_hubs.c
@@ -74,7 +74,7 @@ static void wait_for_dc_servo(struct snd_soc_codec *codec)
		msleep(1);
		reg = snd_soc_read(codec, WM8993_DC_SERVO_READBACK_0);
		dev_dbg(codec->dev, "DC servo: %x\n", reg);
-	} while (reg & WM8993_DCS_DATAPATH_BUSY);
+	} while (reg & WM8993_DCS_DATAPATH_BUSY && count < 400);

	if (reg & WM8993_DCS_DATAPATH_BUSY)
		dev_err(codec->dev, "Timed out waiting for DC Servo\n");
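
The DC servo poll had no bound, so a codec that never clears DCS_DATAPATH_BUSY would spin this thread forever; the loop now also gives up after 400 iterations (the count variable appears to be maintained by the surrounding loop body already) and the existing dev_err() reports the timeout. The standard bounded-poll shape, as a stand-alone sketch with hypothetical names:

/* Generic bounded-poll sketch; all identifiers are hypothetical. */
#include <stdbool.h>

#define POLL_LIMIT 400

static bool hw_busy_sim = true;

/* Stand-in for reading a hardware status register. */
static bool hw_busy(void)
{
	return hw_busy_sim;
}

static int wait_not_busy(void)
{
	int count = 0;

	do {
		/* msleep(1) would go here in kernel code */
		count++;
	} while (hw_busy() && count < POLL_LIMIT);

	return hw_busy() ? -1 : 0;	/* -1: timed out, caller logs it */
}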
diff --git a/sound/soc/imx/Kconfig b/sound/soc/imx/Kconfig
index c7d0fd9b7de8..7174b4c710de 100644
--- a/sound/soc/imx/Kconfig
+++ b/sound/soc/imx/Kconfig
@@ -1,6 +1,6 @@
 config SND_IMX_SOC
	tristate "SoC Audio for Freescale i.MX CPUs"
-	depends on ARCH_MXC && BROKEN
+	depends on ARCH_MXC
	select SND_PCM
	select FIQ
	select SND_SOC_AC97_BUS
diff --git a/sound/soc/sh/Kconfig b/sound/soc/sh/Kconfig
index 106674979b53..f07f6d8b93e1 100644
--- a/sound/soc/sh/Kconfig
+++ b/sound/soc/sh/Kconfig
@@ -32,6 +32,7 @@ config SND_SOC_SH4_SIU
	select DMA_ENGINE
	select DMADEVICES
	select SH_DMAE
+	select FW_LOADER

 ##
 ## Boards